From dbd2dacc3e05015b8143fe2d2abf33131c19ca31 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 8 May 2026 20:12:22 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.01 Source: Original Platform --- .gitattributes | 36 + README.md | 60 + all_results.json | 11 + chat_template.jinja | 121 + config.json | 36 + generation_config.json | 9 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 11 + tokenizer.json | 3 + tokenizer_config.json | 2063 + train_results.json | 11 + trainer_state.json | 283042 ++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 285717 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..1bd28db --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_all_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.01 +tags: +- generated_from_trainer +- sft +- trl +- open-r1 +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.01 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_all_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_all_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.01", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/8pr09okd) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 1.1.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..01271b4 --- /dev/null +++ b/all_results.json @@ -0,0 +1,11 @@ +{ + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.43322460087045367, + "train_runtime": 46186.2128, + "train_samples": 125770, + "train_samples_per_second": 8.169, + "train_steps_per_second": 0.511 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..06df27b --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..50f6077 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..8e81066 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e537368191c7c936268b8238d76da5df9a2c9987ec2aaf2562604f058e519434 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..5cd0e1d --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad4aa431d7028d954db1f0b6f9451195ccc64798e6dd87c4d52858a8ae539d4c +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..04bfe1b --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e008607d74cec5ce9bbca24a1ebf6121c81cb71fbad27d7c59994bcb6c303d +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..3dfaefe --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e32a8469764005b740a2ec7e8ba65e5c5b49949959c4b033470350be56dbd857 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9d4773c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3beeacc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..01271b4 --- /dev/null +++ b/train_results.json @@ -0,0 +1,11 @@ +{ + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.43322460087045367, + "train_runtime": 46186.2128, + "train_samples": 125770, + "train_samples_per_second": 8.169, + "train_steps_per_second": 0.511 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..d22c837 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,283042 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23583, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012721027859051011, + "ewc_loss": 0.0, + "ewc_loss_parallel": 0.0, + "ewc_loss_perp": 0.0, + "grad_norm": 4.835049152374268, + "learning_rate": 0.0, + "loss": 0.7982, + "mean_token_accuracy": 0.7762961387634277, + "num_tokens": 38493.0, + "step": 1 + }, + { + "epoch": 0.00025442055718102023, + "ewc_loss": 0.0, + "ewc_loss_parallel": 0.0, + "ewc_loss_perp": 0.0, + "grad_norm": 4.5886054039001465, + "learning_rate": 4.2390843577787196e-10, + "loss": 0.8329, + "mean_token_accuracy": 0.765798807144165, + "num_tokens": 80419.0, + "step": 2 + }, + { + "epoch": 0.0003816308357715303, + "ewc_loss": 1.6431300764452317e-14, + "ewc_loss_parallel": 1.6479873021779667e-17, + "ewc_loss_perp": 0.0, + "grad_norm": 4.725162029266357, + "learning_rate": 8.478168715557439e-10, + "loss": 0.7224, + "mean_token_accuracy": 0.7962077856063843, + "num_tokens": 118717.0, + "step": 3 + }, + { + "epoch": 0.0005088411143620405, + "ewc_loss": 7.602807272633072e-13, + "ewc_loss_parallel": 7.598088824778415e-16, + "ewc_loss_perp": 0.0, + "grad_norm": 5.366641998291016, + "learning_rate": 1.271725307333616e-09, + "loss": 0.814, + "mean_token_accuracy": 0.7711915969848633, + "num_tokens": 150155.0, + "step": 4 + }, + { + "epoch": 0.0006360513929525506, + "ewc_loss": 5.087485988042317e-12, + "ewc_loss_parallel": 5.079270337660091e-15, + "ewc_loss_perp": 0.0, + "grad_norm": 4.305373668670654, + "learning_rate": 1.6956337431114878e-09, + "loss": 0.7919, + "mean_token_accuracy": 0.7749974727630615, + "num_tokens": 193616.0, + "step": 5 + }, + { + "epoch": 0.0007632616715430606, + "ewc_loss": 2.5011104298755527e-11, + "ewc_loss_parallel": 2.4980018054066022e-14, + "ewc_loss_perp": 0.0, + "grad_norm": 5.209270477294922, + "learning_rate": 2.1195421788893596e-09, + "loss": 0.7894, + "mean_token_accuracy": 0.7780354022979736, + "num_tokens": 227640.0, + "step": 6 + }, + { + "epoch": 0.0008904719501335708, + "ewc_loss": 4.9112713895738125e-11, + "ewc_loss_parallel": 4.907185768843192e-14, + "ewc_loss_perp": 0.0, + "grad_norm": 4.830974102020264, + "learning_rate": 2.543450614667232e-09, + "loss": 0.816, + "mean_token_accuracy": 0.7747328281402588, + "num_tokens": 265114.0, + "step": 7 + }, + { + "epoch": 0.001017682228724081, + "ewc_loss": 2.1736923372372985e-10, + "ewc_loss_parallel": 2.1760371282653068e-13, + "ewc_loss_perp": 0.0, + "grad_norm": 4.939243316650391, + "learning_rate": 2.967359050445104e-09, + "loss": 0.7583, + "mean_token_accuracy": 0.7889528870582581, + "num_tokens": 299865.0, + "step": 8 + }, + { + "epoch": 0.001144892507314591, + "ewc_loss": 3.2014213502407074e-10, + "ewc_loss_parallel": 3.197442310920451e-13, + "ewc_loss_perp": 0.0, + "grad_norm": 4.586737632751465, + "learning_rate": 3.3912674862229757e-09, + "loss": 0.8142, + "mean_token_accuracy": 0.7733393907546997, + "num_tokens": 342063.0, + "step": 9 + }, + { + "epoch": 0.0012721027859051012, + "ewc_loss": 5.893525667488575e-10, + "ewc_loss_parallel": 5.897504706808832e-13, + "ewc_loss_perp": 0.0, + "grad_norm": 5.454995155334473, + "learning_rate": 3.815175922000847e-09, + "loss": 0.8638, + "mean_token_accuracy": 0.7635148763656616, + "num_tokens": 374864.0, + "step": 10 + }, + { + "epoch": 0.0013993130644956112, + "ewc_loss": 1.9208528101444244e-09, + "ewc_loss_parallel": 1.9184653865522705e-12, + "ewc_loss_perp": 0.0, + "grad_norm": 4.302826404571533, + "learning_rate": 4.239084357778719e-09, + "loss": 0.7767, + "mean_token_accuracy": 0.775107741355896, + "num_tokens": 416605.0, + "step": 11 + }, + { + "epoch": 0.0015265233430861213, + "ewc_loss": 2.4010660126805305e-09, + "ewc_loss_parallel": 2.4016344468691386e-12, + "ewc_loss_perp": 0.0, + "grad_norm": 5.44810676574707, + "learning_rate": 4.662992793556591e-09, + "loss": 0.8345, + "mean_token_accuracy": 0.7689513564109802, + "num_tokens": 448798.0, + "step": 12 + }, + { + "epoch": 0.0016537336216766315, + "ewc_loss": 2.9685907065868378e-09, + "ewc_loss_parallel": 2.9700686354772188e-12, + "ewc_loss_perp": 0.0, + "grad_norm": 5.445432662963867, + "learning_rate": 5.086901229334464e-09, + "loss": 0.8739, + "mean_token_accuracy": 0.7540445923805237, + "num_tokens": 480084.0, + "step": 13 + }, + { + "epoch": 0.0017809439002671415, + "ewc_loss": 9.604264050722122e-09, + "ewc_loss_parallel": 9.606537787476555e-12, + "ewc_loss_perp": 0.0, + "grad_norm": 4.3748321533203125, + "learning_rate": 5.510809665112336e-09, + "loss": 0.806, + "mean_token_accuracy": 0.7792655825614929, + "num_tokens": 524543.0, + "step": 14 + }, + { + "epoch": 0.0019081541788576518, + "ewc_loss": 1.437729224562645e-08, + "ewc_loss_parallel": 1.4381384971784428e-11, + "ewc_loss_perp": 0.0, + "grad_norm": 4.785918712615967, + "learning_rate": 5.934718100890208e-09, + "loss": 0.7518, + "mean_token_accuracy": 0.7888622283935547, + "num_tokens": 563314.0, + "step": 15 + }, + { + "epoch": 0.002035364457448162, + "ewc_loss": 1.6298145055770874e-08, + "ewc_loss_parallel": 1.6257217794191092e-11, + "ewc_loss_perp": 0.0, + "grad_norm": 5.07543420791626, + "learning_rate": 6.3586265366680796e-09, + "loss": 0.8424, + "mean_token_accuracy": 0.7630856037139893, + "num_tokens": 598421.0, + "step": 16 + }, + { + "epoch": 0.002162574736038672, + "ewc_loss": 1.9674189388751984e-08, + "ewc_loss_parallel": 1.9667822925839573e-11, + "ewc_loss_perp": 0.0, + "grad_norm": 5.037054061889648, + "learning_rate": 6.782534972445951e-09, + "loss": 0.832, + "mean_token_accuracy": 0.7627519369125366, + "num_tokens": 634690.0, + "step": 17 + }, + { + "epoch": 0.002289785014629182, + "ewc_loss": 2.2351741790771484e-08, + "ewc_loss_parallel": 2.2396307031158358e-11, + "ewc_loss_perp": 0.0, + "grad_norm": 5.1440749168396, + "learning_rate": 7.206443408223823e-09, + "loss": 0.869, + "mean_token_accuracy": 0.7593342661857605, + "num_tokens": 674653.0, + "step": 18 + }, + { + "epoch": 0.0024169952932196924, + "ewc_loss": 6.146728992462158e-08, + "ewc_loss_parallel": 6.139089236967266e-11, + "ewc_loss_perp": 0.0, + "grad_norm": 5.054834842681885, + "learning_rate": 7.630351844001695e-09, + "loss": 0.8005, + "mean_token_accuracy": 0.7768722176551819, + "num_tokens": 708238.0, + "step": 19 + }, + { + "epoch": 0.0025442055718102024, + "ewc_loss": 9.499490261077881e-08, + "ewc_loss_parallel": 9.5042196335271e-11, + "ewc_loss_perp": 0.0, + "grad_norm": 4.631464004516602, + "learning_rate": 8.054260279779567e-09, + "loss": 0.7828, + "mean_token_accuracy": 0.7789447903633118, + "num_tokens": 749312.0, + "step": 20 + }, + { + "epoch": 0.0026714158504007124, + "ewc_loss": 1.0756775736808777e-07, + "ewc_loss_parallel": 1.07775122160092e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 5.414412498474121, + "learning_rate": 8.478168715557438e-09, + "loss": 0.8523, + "mean_token_accuracy": 0.7661522030830383, + "num_tokens": 783532.0, + "step": 21 + }, + { + "epoch": 0.0027986261289912225, + "ewc_loss": 1.1827796697616577e-07, + "ewc_loss_parallel": 1.1823431123048067e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 5.105653762817383, + "learning_rate": 8.902077151335311e-09, + "loss": 0.8303, + "mean_token_accuracy": 0.7739511132240295, + "num_tokens": 817429.0, + "step": 22 + }, + { + "epoch": 0.0029258364075817325, + "ewc_loss": 1.2759119272232056e-07, + "ewc_loss_parallel": 1.2732925824820995e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.823330402374268, + "learning_rate": 9.325985587113182e-09, + "loss": 0.7637, + "mean_token_accuracy": 0.7886726260185242, + "num_tokens": 853964.0, + "step": 23 + }, + { + "epoch": 0.0030530466861722425, + "ewc_loss": 1.387670636177063e-07, + "ewc_loss_parallel": 1.3915268937125802e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 5.464425563812256, + "learning_rate": 9.749894022891054e-09, + "loss": 0.8276, + "mean_token_accuracy": 0.767574667930603, + "num_tokens": 885070.0, + "step": 24 + }, + { + "epoch": 0.003180256964762753, + "ewc_loss": 1.555308699607849e-07, + "ewc_loss_parallel": 1.5552359400317073e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.831496715545654, + "learning_rate": 1.0173802458668929e-08, + "loss": 0.8799, + "mean_token_accuracy": 0.7515967488288879, + "num_tokens": 926893.0, + "step": 25 + }, + { + "epoch": 0.003307467243353263, + "ewc_loss": 3.9301812648773193e-07, + "ewc_loss_parallel": 3.92901711165905e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.89300012588501, + "learning_rate": 1.05977108944468e-08, + "loss": 0.8162, + "mean_token_accuracy": 0.7674952745437622, + "num_tokens": 964773.0, + "step": 26 + }, + { + "epoch": 0.003434677521943773, + "ewc_loss": 5.587935447692871e-07, + "ewc_loss_parallel": 5.602487362921238e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.7495012283325195, + "learning_rate": 1.1021619330224672e-08, + "loss": 0.7692, + "mean_token_accuracy": 0.7851283550262451, + "num_tokens": 1002725.0, + "step": 27 + }, + { + "epoch": 0.003561887800534283, + "ewc_loss": 6.705522537231445e-07, + "ewc_loss_parallel": 6.693881005048752e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.9860453605651855, + "learning_rate": 1.1445527766002543e-08, + "loss": 0.873, + "mean_token_accuracy": 0.7576355934143066, + "num_tokens": 1040296.0, + "step": 28 + }, + { + "epoch": 0.003689098079124793, + "ewc_loss": 7.227063179016113e-07, + "ewc_loss_parallel": 7.239577826112509e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.445861339569092, + "learning_rate": 1.1869436201780416e-08, + "loss": 0.7657, + "mean_token_accuracy": 0.7852626442909241, + "num_tokens": 1081711.0, + "step": 29 + }, + { + "epoch": 0.0038163083577153036, + "ewc_loss": 7.525086402893066e-07, + "ewc_loss_parallel": 7.530616130679846e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.81271505355835, + "learning_rate": 1.2293344637558287e-08, + "loss": 0.812, + "mean_token_accuracy": 0.7752777934074402, + "num_tokens": 1120556.0, + "step": 30 + }, + { + "epoch": 0.003943518636305814, + "ewc_loss": 7.934868335723877e-07, + "ewc_loss_parallel": 7.930793799459934e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.804330825805664, + "learning_rate": 1.2717253073336159e-08, + "loss": 0.764, + "mean_token_accuracy": 0.7893945574760437, + "num_tokens": 1157723.0, + "step": 31 + }, + { + "epoch": 0.004070728914896324, + "ewc_loss": 8.605420589447021e-07, + "ewc_loss_parallel": 8.62200977280736e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.690738201141357, + "learning_rate": 1.314116150911403e-08, + "loss": 0.8101, + "mean_token_accuracy": 0.7741218209266663, + "num_tokens": 1197879.0, + "step": 32 + }, + { + "epoch": 0.004197939193486834, + "ewc_loss": 9.015202522277832e-07, + "ewc_loss_parallel": 9.022187441587448e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.752446174621582, + "learning_rate": 1.3565069944891903e-08, + "loss": 0.8109, + "mean_token_accuracy": 0.7754741311073303, + "num_tokens": 1237342.0, + "step": 33 + }, + { + "epoch": 0.004325149472077344, + "ewc_loss": 9.909272193908691e-07, + "ewc_loss_parallel": 9.89530235528946e-10, + "ewc_loss_perp": 0.0, + "grad_norm": 4.492638111114502, + "learning_rate": 1.3988978380669775e-08, + "loss": 0.7821, + "mean_token_accuracy": 0.7823759913444519, + "num_tokens": 1280197.0, + "step": 34 + }, + { + "epoch": 0.004452359750667854, + "ewc_loss": 1.2889504432678223e-06, + "ewc_loss_parallel": 1.2878444977104664e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.712782859802246, + "learning_rate": 1.4412886816447646e-08, + "loss": 0.7645, + "mean_token_accuracy": 0.7847288250923157, + "num_tokens": 1318625.0, + "step": 35 + }, + { + "epoch": 0.004579570029258364, + "ewc_loss": 2.4437904357910156e-06, + "ewc_loss_parallel": 2.444721758365631e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.8404083251953125, + "learning_rate": 1.4836795252225519e-08, + "loss": 0.822, + "mean_token_accuracy": 0.7721850872039795, + "num_tokens": 1356868.0, + "step": 36 + }, + { + "epoch": 0.004706780307848874, + "ewc_loss": 3.3229589462280273e-06, + "ewc_loss_parallel": 3.3178366720676422e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.874773979187012, + "learning_rate": 1.526070368800339e-08, + "loss": 0.7937, + "mean_token_accuracy": 0.7810244560241699, + "num_tokens": 1394696.0, + "step": 37 + }, + { + "epoch": 0.004833990586439385, + "ewc_loss": 3.844499588012695e-06, + "ewc_loss_parallel": 3.841705620288849e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.417423248291016, + "learning_rate": 1.5684612123781262e-08, + "loss": 0.7827, + "mean_token_accuracy": 0.7822397351264954, + "num_tokens": 1438738.0, + "step": 38 + }, + { + "epoch": 0.004961200865029895, + "ewc_loss": 4.023313522338867e-06, + "ewc_loss_parallel": 4.016328603029251e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.793605804443359, + "learning_rate": 1.6108520559559135e-08, + "loss": 0.7483, + "mean_token_accuracy": 0.7924410104751587, + "num_tokens": 1475089.0, + "step": 39 + }, + { + "epoch": 0.005088411143620405, + "ewc_loss": 4.26173210144043e-06, + "ewc_loss_parallel": 4.249159246683121e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.777746200561523, + "learning_rate": 1.6532428995337004e-08, + "loss": 0.8084, + "mean_token_accuracy": 0.7742187976837158, + "num_tokens": 1514566.0, + "step": 40 + }, + { + "epoch": 0.005215621422210915, + "ewc_loss": 4.4405460357666016e-06, + "ewc_loss_parallel": 4.452886059880257e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.985526084899902, + "learning_rate": 1.6956337431114877e-08, + "loss": 0.8232, + "mean_token_accuracy": 0.7735110521316528, + "num_tokens": 1552560.0, + "step": 41 + }, + { + "epoch": 0.005342831700801425, + "ewc_loss": 4.6193599700927734e-06, + "ewc_loss_parallel": 4.627509042620659e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 5.416777610778809, + "learning_rate": 1.738024586689275e-08, + "loss": 0.8006, + "mean_token_accuracy": 0.7770767211914062, + "num_tokens": 1584759.0, + "step": 42 + }, + { + "epoch": 0.005470041979391935, + "ewc_loss": 4.738569259643555e-06, + "ewc_loss_parallel": 4.743924364447594e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.8780646324157715, + "learning_rate": 1.7804154302670622e-08, + "loss": 0.8021, + "mean_token_accuracy": 0.775871992111206, + "num_tokens": 1621825.0, + "step": 43 + }, + { + "epoch": 0.005597252257982445, + "ewc_loss": 5.036592483520508e-06, + "ewc_loss_parallel": 5.034962669014931e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.813459873199463, + "learning_rate": 1.8228062738448494e-08, + "loss": 0.8277, + "mean_token_accuracy": 0.7682169675827026, + "num_tokens": 1662946.0, + "step": 44 + }, + { + "epoch": 0.005724462536572955, + "ewc_loss": 5.245208740234375e-06, + "ewc_loss_parallel": 5.238689482212067e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.666078567504883, + "learning_rate": 1.8651971174226364e-08, + "loss": 0.8154, + "mean_token_accuracy": 0.7737029790878296, + "num_tokens": 1699433.0, + "step": 45 + }, + { + "epoch": 0.005851672815163465, + "ewc_loss": 5.543231964111328e-06, + "ewc_loss_parallel": 5.529727786779404e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.3684797286987305, + "learning_rate": 1.9075879610004236e-08, + "loss": 0.7462, + "mean_token_accuracy": 0.7885390520095825, + "num_tokens": 1742812.0, + "step": 46 + }, + { + "epoch": 0.005978883093753975, + "ewc_loss": 5.751848220825195e-06, + "ewc_loss_parallel": 5.762558430433273e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.8986287117004395, + "learning_rate": 1.949978804578211e-08, + "loss": 0.8016, + "mean_token_accuracy": 0.7784011363983154, + "num_tokens": 1778725.0, + "step": 47 + }, + { + "epoch": 0.006106093372344485, + "ewc_loss": 6.4373016357421875e-06, + "ewc_loss_parallel": 6.4319465309381485e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.875207901000977, + "learning_rate": 1.9923696481559985e-08, + "loss": 0.8481, + "mean_token_accuracy": 0.7617505788803101, + "num_tokens": 1816592.0, + "step": 48 + }, + { + "epoch": 0.006233303650934996, + "ewc_loss": 7.927417755126953e-06, + "ewc_loss_parallel": 7.916241884231567e-09, + "ewc_loss_perp": 0.0, + "grad_norm": 4.4648027420043945, + "learning_rate": 2.0347604917337857e-08, + "loss": 0.7397, + "mean_token_accuracy": 0.7902255058288574, + "num_tokens": 1859907.0, + "step": 49 + }, + { + "epoch": 0.006360513929525506, + "ewc_loss": 1.2218952178955078e-05, + "ewc_loss_parallel": 1.2223608791828156e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.766383647918701, + "learning_rate": 2.0771513353115727e-08, + "loss": 0.7291, + "mean_token_accuracy": 0.7997006177902222, + "num_tokens": 1896627.0, + "step": 50 + }, + { + "epoch": 0.006487724208116016, + "ewc_loss": 1.6570091247558594e-05, + "ewc_loss_parallel": 1.6530975699424744e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 5.032137393951416, + "learning_rate": 2.11954217888936e-08, + "loss": 0.8442, + "mean_token_accuracy": 0.7667090892791748, + "num_tokens": 1934041.0, + "step": 51 + }, + { + "epoch": 0.006614934486706526, + "ewc_loss": 1.919269561767578e-05, + "ewc_loss_parallel": 1.9208528101444244e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.662564277648926, + "learning_rate": 2.1619330224671472e-08, + "loss": 0.7737, + "mean_token_accuracy": 0.7845584750175476, + "num_tokens": 1976482.0, + "step": 52 + }, + { + "epoch": 0.006742144765297036, + "ewc_loss": 2.1457672119140625e-05, + "ewc_loss_parallel": 2.1420419216156006e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 5.376045227050781, + "learning_rate": 2.2043238660449344e-08, + "loss": 0.803, + "mean_token_accuracy": 0.7724766731262207, + "num_tokens": 2009224.0, + "step": 53 + }, + { + "epoch": 0.006869355043887546, + "ewc_loss": 2.3126602172851562e-05, + "ewc_loss_parallel": 2.3166649043560028e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.980083465576172, + "learning_rate": 2.2467147096227214e-08, + "loss": 0.8475, + "mean_token_accuracy": 0.7620729207992554, + "num_tokens": 2049235.0, + "step": 54 + }, + { + "epoch": 0.006996565322478056, + "ewc_loss": 2.4437904357910156e-05, + "ewc_loss_parallel": 2.444721758365631e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.682716369628906, + "learning_rate": 2.2891055532005086e-08, + "loss": 0.8602, + "mean_token_accuracy": 0.7612201571464539, + "num_tokens": 2090260.0, + "step": 55 + }, + { + "epoch": 0.007123775601068566, + "ewc_loss": 2.491474151611328e-05, + "ewc_loss_parallel": 2.491287887096405e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 5.090335845947266, + "learning_rate": 2.331496396778296e-08, + "loss": 0.8657, + "mean_token_accuracy": 0.760025680065155, + "num_tokens": 2126686.0, + "step": 56 + }, + { + "epoch": 0.007250985879659076, + "ewc_loss": 2.586841583251953e-05, + "ewc_loss_parallel": 2.584420144557953e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.462711334228516, + "learning_rate": 2.373887240356083e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7846771478652954, + "num_tokens": 2171355.0, + "step": 57 + }, + { + "epoch": 0.007378196158249586, + "ewc_loss": 2.6345252990722656e-05, + "ewc_loss_parallel": 2.6309862732887268e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.705465316772461, + "learning_rate": 2.4162780839338704e-08, + "loss": 0.7407, + "mean_token_accuracy": 0.7931541204452515, + "num_tokens": 2211660.0, + "step": 58 + }, + { + "epoch": 0.007505406436840096, + "ewc_loss": 2.6345252990722656e-05, + "ewc_loss_parallel": 2.6309862732887268e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 5.599023818969727, + "learning_rate": 2.4586689275116573e-08, + "loss": 0.8709, + "mean_token_accuracy": 0.75911945104599, + "num_tokens": 2244411.0, + "step": 59 + }, + { + "epoch": 0.007632616715430607, + "ewc_loss": 2.6464462280273438e-05, + "ewc_loss_parallel": 2.6426278054714203e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.473204612731934, + "learning_rate": 2.5010597710894446e-08, + "loss": 0.7977, + "mean_token_accuracy": 0.7713140249252319, + "num_tokens": 2285253.0, + "step": 60 + }, + { + "epoch": 0.007759826994021117, + "ewc_loss": 2.6702880859375e-05, + "ewc_loss_parallel": 2.6659108698368073e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.577515602111816, + "learning_rate": 2.5434506146672318e-08, + "loss": 0.7621, + "mean_token_accuracy": 0.785428524017334, + "num_tokens": 2328577.0, + "step": 61 + }, + { + "epoch": 0.007887037272611627, + "ewc_loss": 2.6702880859375e-05, + "ewc_loss_parallel": 2.6659108698368073e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.799027442932129, + "learning_rate": 2.585841458245019e-08, + "loss": 0.8419, + "mean_token_accuracy": 0.7702468633651733, + "num_tokens": 2366524.0, + "step": 62 + }, + { + "epoch": 0.008014247551202136, + "ewc_loss": 2.7418136596679688e-05, + "ewc_loss_parallel": 2.7474015951156616e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.325414657592773, + "learning_rate": 2.628232301822806e-08, + "loss": 0.7556, + "mean_token_accuracy": 0.7895278334617615, + "num_tokens": 2408628.0, + "step": 63 + }, + { + "epoch": 0.008141457829792647, + "ewc_loss": 2.765655517578125e-05, + "ewc_loss_parallel": 2.7706846594810486e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.386251926422119, + "learning_rate": 2.6706231454005933e-08, + "loss": 0.7953, + "mean_token_accuracy": 0.7753013372421265, + "num_tokens": 2451800.0, + "step": 64 + }, + { + "epoch": 0.008268668108383158, + "ewc_loss": 2.8371810913085938e-05, + "ewc_loss_parallel": 2.8405338525772095e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 5.758973121643066, + "learning_rate": 2.7130139889783805e-08, + "loss": 0.8661, + "mean_token_accuracy": 0.7549817562103271, + "num_tokens": 2481448.0, + "step": 65 + }, + { + "epoch": 0.008395878386973667, + "ewc_loss": 3.0040740966796875e-05, + "ewc_loss_parallel": 3.003515303134918e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.394927501678467, + "learning_rate": 2.7554048325561678e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.7758862376213074, + "num_tokens": 2526339.0, + "step": 66 + }, + { + "epoch": 0.008523088665564178, + "ewc_loss": 3.1948089599609375e-05, + "ewc_loss_parallel": 3.189779818058014e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.5773444175720215, + "learning_rate": 2.797795676133955e-08, + "loss": 0.8181, + "mean_token_accuracy": 0.7701197862625122, + "num_tokens": 2570691.0, + "step": 67 + }, + { + "epoch": 0.008650298944154687, + "ewc_loss": 3.5762786865234375e-05, + "ewc_loss_parallel": 3.585591912269592e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.751788139343262, + "learning_rate": 2.840186519711742e-08, + "loss": 0.8047, + "mean_token_accuracy": 0.7752950191497803, + "num_tokens": 2609207.0, + "step": 68 + }, + { + "epoch": 0.008777509222745198, + "ewc_loss": 4.982948303222656e-05, + "ewc_loss_parallel": 4.98257577419281e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 5.1442341804504395, + "learning_rate": 2.8825773632895292e-08, + "loss": 0.793, + "mean_token_accuracy": 0.7769806981086731, + "num_tokens": 2645494.0, + "step": 69 + }, + { + "epoch": 0.008904719501335707, + "ewc_loss": 7.104873657226562e-05, + "ewc_loss_parallel": 7.12461769580841e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 4.757089138031006, + "learning_rate": 2.9249682068673165e-08, + "loss": 0.75, + "mean_token_accuracy": 0.7916167974472046, + "num_tokens": 2686258.0, + "step": 70 + }, + { + "epoch": 0.009031929779926218, + "ewc_loss": 8.869171142578125e-05, + "ewc_loss_parallel": 8.847564458847046e-08, + "ewc_loss_perp": 0.0, + "grad_norm": 5.33197021484375, + "learning_rate": 2.9673590504451037e-08, + "loss": 0.805, + "mean_token_accuracy": 0.7778120040893555, + "num_tokens": 2720544.0, + "step": 71 + }, + { + "epoch": 0.009159140058516728, + "ewc_loss": 0.00010061264038085938, + "ewc_loss_parallel": 1.0058283805847168e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.12754487991333, + "learning_rate": 3.0097498940228907e-08, + "loss": 0.7677, + "mean_token_accuracy": 0.7862880229949951, + "num_tokens": 2758068.0, + "step": 72 + }, + { + "epoch": 0.009286350337107238, + "ewc_loss": 0.00010919570922851562, + "ewc_loss_parallel": 1.0896474123001099e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.313689231872559, + "learning_rate": 3.052140737600678e-08, + "loss": 0.8519, + "mean_token_accuracy": 0.7630953788757324, + "num_tokens": 2793342.0, + "step": 73 + }, + { + "epoch": 0.009413560615697748, + "ewc_loss": 0.0001125335693359375, + "ewc_loss_parallel": 1.126900315284729e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.493561744689941, + "learning_rate": 3.094531581178465e-08, + "loss": 0.8888, + "mean_token_accuracy": 0.7551553249359131, + "num_tokens": 2828003.0, + "step": 74 + }, + { + "epoch": 0.009540770894288259, + "ewc_loss": 0.00011348724365234375, + "ewc_loss_parallel": 1.1362135410308838e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.587783336639404, + "learning_rate": 3.1369224247562524e-08, + "loss": 0.7639, + "mean_token_accuracy": 0.7842103242874146, + "num_tokens": 2874755.0, + "step": 75 + }, + { + "epoch": 0.00966798117287877, + "ewc_loss": 0.00011491775512695312, + "ewc_loss_parallel": 1.150183379650116e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.940126895904541, + "learning_rate": 3.17931326833404e-08, + "loss": 0.7862, + "mean_token_accuracy": 0.7763802409172058, + "num_tokens": 2909703.0, + "step": 76 + }, + { + "epoch": 0.009795191451469279, + "ewc_loss": 0.00011777877807617188, + "ewc_loss_parallel": 1.1781230568885803e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.488656520843506, + "learning_rate": 3.221704111911827e-08, + "loss": 0.7165, + "mean_token_accuracy": 0.7987850904464722, + "num_tokens": 2954020.0, + "step": 77 + }, + { + "epoch": 0.00992240173005979, + "ewc_loss": 0.000118255615234375, + "ewc_loss_parallel": 1.1827796697616577e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.49862003326416, + "learning_rate": 3.264094955489614e-08, + "loss": 0.8114, + "mean_token_accuracy": 0.7750422954559326, + "num_tokens": 2984894.0, + "step": 78 + }, + { + "epoch": 0.010049612008650299, + "ewc_loss": 0.000118255615234375, + "ewc_loss_parallel": 1.1827796697616577e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.246587753295898, + "learning_rate": 3.306485799067401e-08, + "loss": 0.7184, + "mean_token_accuracy": 0.7954885959625244, + "num_tokens": 3017773.0, + "step": 79 + }, + { + "epoch": 0.01017682228724081, + "ewc_loss": 0.0001201629638671875, + "ewc_loss_parallel": 1.2014061212539673e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.658181190490723, + "learning_rate": 3.348876642645188e-08, + "loss": 0.7178, + "mean_token_accuracy": 0.7980969548225403, + "num_tokens": 3059739.0, + "step": 80 + }, + { + "epoch": 0.010304032565831319, + "ewc_loss": 0.00011920928955078125, + "ewc_loss_parallel": 1.1920928955078125e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.174802780151367, + "learning_rate": 3.391267486222975e-08, + "loss": 0.8095, + "mean_token_accuracy": 0.7725939750671387, + "num_tokens": 3094910.0, + "step": 81 + }, + { + "epoch": 0.01043124284442183, + "ewc_loss": 0.00011682510375976562, + "ewc_loss_parallel": 1.1688098311424255e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.888227462768555, + "learning_rate": 3.4336583298007626e-08, + "loss": 0.8138, + "mean_token_accuracy": 0.7737807035446167, + "num_tokens": 3133884.0, + "step": 82 + }, + { + "epoch": 0.010558453123012339, + "ewc_loss": 0.00011539459228515625, + "ewc_loss_parallel": 1.1548399925231934e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.430181980133057, + "learning_rate": 3.47604917337855e-08, + "loss": 0.8461, + "mean_token_accuracy": 0.7608584761619568, + "num_tokens": 3166755.0, + "step": 83 + }, + { + "epoch": 0.01068566340160285, + "ewc_loss": 0.00011348724365234375, + "ewc_loss_parallel": 1.1362135410308838e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.048045635223389, + "learning_rate": 3.518440016956337e-08, + "loss": 0.8151, + "mean_token_accuracy": 0.7708519101142883, + "num_tokens": 3204825.0, + "step": 84 + }, + { + "epoch": 0.010812873680193359, + "ewc_loss": 0.0001125335693359375, + "ewc_loss_parallel": 1.126900315284729e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.071069717407227, + "learning_rate": 3.5608308605341244e-08, + "loss": 0.8508, + "mean_token_accuracy": 0.7625106573104858, + "num_tokens": 3244036.0, + "step": 85 + }, + { + "epoch": 0.01094008395878387, + "ewc_loss": 0.0001125335693359375, + "ewc_loss_parallel": 1.126900315284729e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.8272552490234375, + "learning_rate": 3.6032217041119116e-08, + "loss": 0.8379, + "mean_token_accuracy": 0.7670039534568787, + "num_tokens": 3285969.0, + "step": 86 + }, + { + "epoch": 0.01106729423737438, + "ewc_loss": 0.0001125335693359375, + "ewc_loss_parallel": 1.126900315284729e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.803405284881592, + "learning_rate": 3.645612547689699e-08, + "loss": 0.8093, + "mean_token_accuracy": 0.7745776176452637, + "num_tokens": 3327648.0, + "step": 87 + }, + { + "epoch": 0.01119450451596489, + "ewc_loss": 0.00011301040649414062, + "ewc_loss_parallel": 1.1315569281578064e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.9376749992370605, + "learning_rate": 3.6880033912674855e-08, + "loss": 0.8121, + "mean_token_accuracy": 0.7728368639945984, + "num_tokens": 3367399.0, + "step": 88 + }, + { + "epoch": 0.0113217147945554, + "ewc_loss": 0.00011396408081054688, + "ewc_loss_parallel": 1.1408701539039612e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.127857685089111, + "learning_rate": 3.730394234845273e-08, + "loss": 0.7957, + "mean_token_accuracy": 0.7795237898826599, + "num_tokens": 3405402.0, + "step": 89 + }, + { + "epoch": 0.01144892507314591, + "ewc_loss": 0.00011682510375976562, + "ewc_loss_parallel": 1.1688098311424255e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.24790620803833, + "learning_rate": 3.77278507842306e-08, + "loss": 0.8789, + "mean_token_accuracy": 0.7536770701408386, + "num_tokens": 3441791.0, + "step": 90 + }, + { + "epoch": 0.01157613535173642, + "ewc_loss": 0.00011920928955078125, + "ewc_loss_parallel": 1.1920928955078125e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.202239036560059, + "learning_rate": 3.815175922000847e-08, + "loss": 0.8382, + "mean_token_accuracy": 0.7692206501960754, + "num_tokens": 3480151.0, + "step": 91 + }, + { + "epoch": 0.01170334563032693, + "ewc_loss": 0.0001239776611328125, + "ewc_loss_parallel": 1.2386590242385864e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.091145992279053, + "learning_rate": 3.8575667655786345e-08, + "loss": 0.8018, + "mean_token_accuracy": 0.7756656408309937, + "num_tokens": 3516867.0, + "step": 92 + }, + { + "epoch": 0.01183055590891744, + "ewc_loss": 0.00012683868408203125, + "ewc_loss_parallel": 1.2665987014770508e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.7100911140441895, + "learning_rate": 3.899957609156422e-08, + "loss": 0.7781, + "mean_token_accuracy": 0.782346248626709, + "num_tokens": 3557466.0, + "step": 93 + }, + { + "epoch": 0.01195776618750795, + "ewc_loss": 0.0001354217529296875, + "ewc_loss_parallel": 1.3504177331924438e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 4.92112922668457, + "learning_rate": 3.94234845273421e-08, + "loss": 0.8075, + "mean_token_accuracy": 0.7730696797370911, + "num_tokens": 3596009.0, + "step": 94 + }, + { + "epoch": 0.012084976466098461, + "ewc_loss": 0.0001544952392578125, + "ewc_loss_parallel": 1.5459954738616943e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.148350238800049, + "learning_rate": 3.984739296311997e-08, + "loss": 0.7996, + "mean_token_accuracy": 0.7749776244163513, + "num_tokens": 3631582.0, + "step": 95 + }, + { + "epoch": 0.01221218674468897, + "ewc_loss": 0.000186920166015625, + "ewc_loss_parallel": 1.8719583749771118e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.162476062774658, + "learning_rate": 4.027130139889784e-08, + "loss": 0.8264, + "mean_token_accuracy": 0.7681081295013428, + "num_tokens": 3672855.0, + "step": 96 + }, + { + "epoch": 0.012339397023279481, + "ewc_loss": 0.0002460479736328125, + "ewc_loss_parallel": 2.4586915969848633e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.077449798583984, + "learning_rate": 4.0695209834675715e-08, + "loss": 0.8186, + "mean_token_accuracy": 0.7726196050643921, + "num_tokens": 3714100.0, + "step": 97 + }, + { + "epoch": 0.012466607301869992, + "ewc_loss": 0.0003414154052734375, + "ewc_loss_parallel": 3.4086406230926514e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.284477233886719, + "learning_rate": 4.111911827045358e-08, + "loss": 0.7701, + "mean_token_accuracy": 0.7846237421035767, + "num_tokens": 3752834.0, + "step": 98 + }, + { + "epoch": 0.012593817580460501, + "ewc_loss": 0.000431060791015625, + "ewc_loss_parallel": 4.302710294723511e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.440809726715088, + "learning_rate": 4.154302670623145e-08, + "loss": 0.7808, + "mean_token_accuracy": 0.7806040644645691, + "num_tokens": 3791513.0, + "step": 99 + }, + { + "epoch": 0.012721027859051012, + "ewc_loss": 0.00048065185546875, + "ewc_loss_parallel": 4.805624485015869e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.937448978424072, + "learning_rate": 4.1966935142009326e-08, + "loss": 0.8039, + "mean_token_accuracy": 0.7738640308380127, + "num_tokens": 3825085.0, + "step": 100 + }, + { + "epoch": 0.012848238137641521, + "ewc_loss": 0.000492095947265625, + "ewc_loss_parallel": 4.917383193969727e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.381030082702637, + "learning_rate": 4.23908435777872e-08, + "loss": 0.8037, + "mean_token_accuracy": 0.7763598561286926, + "num_tokens": 3866492.0, + "step": 101 + }, + { + "epoch": 0.012975448416232032, + "ewc_loss": 0.000484466552734375, + "ewc_loss_parallel": 4.842877388000488e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.655030727386475, + "learning_rate": 4.281475201356507e-08, + "loss": 0.8305, + "mean_token_accuracy": 0.7629142999649048, + "num_tokens": 3904053.0, + "step": 102 + }, + { + "epoch": 0.013102658694822541, + "ewc_loss": 0.00048065185546875, + "ewc_loss_parallel": 4.805624485015869e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.652588844299316, + "learning_rate": 4.3238660449342943e-08, + "loss": 0.7391, + "mean_token_accuracy": 0.7952284812927246, + "num_tokens": 3935118.0, + "step": 103 + }, + { + "epoch": 0.013229868973413052, + "ewc_loss": 0.000484466552734375, + "ewc_loss_parallel": 4.842877388000488e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.2665605545043945, + "learning_rate": 4.3662568885120816e-08, + "loss": 0.789, + "mean_token_accuracy": 0.7765403985977173, + "num_tokens": 3977899.0, + "step": 104 + }, + { + "epoch": 0.013357079252003561, + "ewc_loss": 0.000484466552734375, + "ewc_loss_parallel": 4.842877388000488e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.298080921173096, + "learning_rate": 4.408647732089869e-08, + "loss": 0.8095, + "mean_token_accuracy": 0.7734013795852661, + "num_tokens": 4019416.0, + "step": 105 + }, + { + "epoch": 0.013484289530594072, + "ewc_loss": 0.000484466552734375, + "ewc_loss_parallel": 4.842877388000488e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.255363464355469, + "learning_rate": 4.451038575667656e-08, + "loss": 0.7981, + "mean_token_accuracy": 0.772476077079773, + "num_tokens": 4063887.0, + "step": 106 + }, + { + "epoch": 0.013611499809184581, + "ewc_loss": 0.000484466552734375, + "ewc_loss_parallel": 4.842877388000488e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.39852237701416, + "learning_rate": 4.493429419245443e-08, + "loss": 0.8293, + "mean_token_accuracy": 0.7677338719367981, + "num_tokens": 4101309.0, + "step": 107 + }, + { + "epoch": 0.013738710087775092, + "ewc_loss": 0.000484466552734375, + "ewc_loss_parallel": 4.842877388000488e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.528039455413818, + "learning_rate": 4.53582026282323e-08, + "loss": 0.8131, + "mean_token_accuracy": 0.7701451778411865, + "num_tokens": 4136669.0, + "step": 108 + }, + { + "epoch": 0.013865920366365603, + "ewc_loss": 0.00048065185546875, + "ewc_loss_parallel": 4.805624485015869e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.139890670776367, + "learning_rate": 4.578211106401017e-08, + "loss": 0.7443, + "mean_token_accuracy": 0.7897796630859375, + "num_tokens": 4178583.0, + "step": 109 + }, + { + "epoch": 0.013993130644956112, + "ewc_loss": 0.000469207763671875, + "ewc_loss_parallel": 4.6938657760620117e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.110963344573975, + "learning_rate": 4.6206019499788045e-08, + "loss": 0.7748, + "mean_token_accuracy": 0.777565598487854, + "num_tokens": 4219552.0, + "step": 110 + }, + { + "epoch": 0.014120340923546623, + "ewc_loss": 0.0004634857177734375, + "ewc_loss_parallel": 4.637986421585083e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.155389308929443, + "learning_rate": 4.662992793556592e-08, + "loss": 0.7673, + "mean_token_accuracy": 0.7853755950927734, + "num_tokens": 4264487.0, + "step": 111 + }, + { + "epoch": 0.014247551202137132, + "ewc_loss": 0.0004520416259765625, + "ewc_loss_parallel": 4.5262277126312256e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.22394323348999, + "learning_rate": 4.705383637134379e-08, + "loss": 0.7336, + "mean_token_accuracy": 0.790625810623169, + "num_tokens": 4303015.0, + "step": 112 + }, + { + "epoch": 0.014374761480727643, + "ewc_loss": 0.000438690185546875, + "ewc_loss_parallel": 4.3958425521850586e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.1959943771362305, + "learning_rate": 4.747774480712166e-08, + "loss": 0.7884, + "mean_token_accuracy": 0.7778067588806152, + "num_tokens": 4345446.0, + "step": 113 + }, + { + "epoch": 0.014501971759318152, + "ewc_loss": 0.00042724609375, + "ewc_loss_parallel": 4.2654573917388916e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.323200702667236, + "learning_rate": 4.7901653242899535e-08, + "loss": 0.7413, + "mean_token_accuracy": 0.7914602756500244, + "num_tokens": 4383854.0, + "step": 114 + }, + { + "epoch": 0.014629182037908663, + "ewc_loss": 0.00041961669921875, + "ewc_loss_parallel": 4.1909515857696533e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.348447322845459, + "learning_rate": 4.832556167867741e-08, + "loss": 0.8097, + "mean_token_accuracy": 0.772890567779541, + "num_tokens": 4420750.0, + "step": 115 + }, + { + "epoch": 0.014756392316499172, + "ewc_loss": 0.0004100799560546875, + "ewc_loss_parallel": 4.0978193283081055e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.432947635650635, + "learning_rate": 4.8749470114455274e-08, + "loss": 0.8068, + "mean_token_accuracy": 0.7733354568481445, + "num_tokens": 4458761.0, + "step": 116 + }, + { + "epoch": 0.014883602595089683, + "ewc_loss": 0.00040435791015625, + "ewc_loss_parallel": 4.041939973831177e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.257537364959717, + "learning_rate": 4.9173378550233146e-08, + "loss": 0.7672, + "mean_token_accuracy": 0.7806214690208435, + "num_tokens": 4496547.0, + "step": 117 + }, + { + "epoch": 0.015010812873680193, + "ewc_loss": 0.000396728515625, + "ewc_loss_parallel": 3.9674341678619385e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.32292366027832, + "learning_rate": 4.959728698601102e-08, + "loss": 0.7654, + "mean_token_accuracy": 0.7892588973045349, + "num_tokens": 4533357.0, + "step": 118 + }, + { + "epoch": 0.015138023152270703, + "ewc_loss": 0.000396728515625, + "ewc_loss_parallel": 3.9674341678619385e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.169501781463623, + "learning_rate": 5.002119542178889e-08, + "loss": 0.7399, + "mean_token_accuracy": 0.7905914783477783, + "num_tokens": 4573570.0, + "step": 119 + }, + { + "epoch": 0.015265233430861214, + "ewc_loss": 0.0003871917724609375, + "ewc_loss_parallel": 3.8743019104003906e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.302407741546631, + "learning_rate": 5.0445103857566764e-08, + "loss": 0.7949, + "mean_token_accuracy": 0.7741454839706421, + "num_tokens": 4613195.0, + "step": 120 + }, + { + "epoch": 0.015392443709451724, + "ewc_loss": 0.0003814697265625, + "ewc_loss_parallel": 3.818422555923462e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.033573150634766, + "learning_rate": 5.0869012293344637e-08, + "loss": 0.7153, + "mean_token_accuracy": 0.797850489616394, + "num_tokens": 4657230.0, + "step": 121 + }, + { + "epoch": 0.015519653988042234, + "ewc_loss": 0.00037384033203125, + "ewc_loss_parallel": 3.7439167499542236e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.153960227966309, + "learning_rate": 5.129292072912251e-08, + "loss": 0.789, + "mean_token_accuracy": 0.7803250551223755, + "num_tokens": 4701436.0, + "step": 122 + }, + { + "epoch": 0.015646864266632744, + "ewc_loss": 0.0003719329833984375, + "ewc_loss_parallel": 3.725290298461914e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.272110939025879, + "learning_rate": 5.171682916490038e-08, + "loss": 0.7768, + "mean_token_accuracy": 0.7802776098251343, + "num_tokens": 4741806.0, + "step": 123 + }, + { + "epoch": 0.015774074545223254, + "ewc_loss": 0.0003662109375, + "ewc_loss_parallel": 3.6694109439849854e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.4438982009887695, + "learning_rate": 5.2140737600678254e-08, + "loss": 0.8141, + "mean_token_accuracy": 0.7654598355293274, + "num_tokens": 4780671.0, + "step": 124 + }, + { + "epoch": 0.015901284823813765, + "ewc_loss": 0.0003662109375, + "ewc_loss_parallel": 3.6694109439849854e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.414806365966797, + "learning_rate": 5.256464603645612e-08, + "loss": 0.7627, + "mean_token_accuracy": 0.7862942218780518, + "num_tokens": 4816879.0, + "step": 125 + }, + { + "epoch": 0.016028495102404273, + "ewc_loss": 0.000370025634765625, + "ewc_loss_parallel": 3.7066638469696045e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.34622859954834, + "learning_rate": 5.298855447223399e-08, + "loss": 0.8152, + "mean_token_accuracy": 0.7670844793319702, + "num_tokens": 4858704.0, + "step": 126 + }, + { + "epoch": 0.016155705380994784, + "ewc_loss": 0.0003681182861328125, + "ewc_loss_parallel": 3.688037395477295e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.302659511566162, + "learning_rate": 5.3412462908011865e-08, + "loss": 0.7725, + "mean_token_accuracy": 0.7823249101638794, + "num_tokens": 4896780.0, + "step": 127 + }, + { + "epoch": 0.016282915659585295, + "ewc_loss": 0.0003681182861328125, + "ewc_loss_parallel": 3.688037395477295e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.474038124084473, + "learning_rate": 5.383637134378974e-08, + "loss": 0.8023, + "mean_token_accuracy": 0.7718168497085571, + "num_tokens": 4932108.0, + "step": 128 + }, + { + "epoch": 0.016410125938175806, + "ewc_loss": 0.0003719329833984375, + "ewc_loss_parallel": 3.725290298461914e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.257008075714111, + "learning_rate": 5.426027977956761e-08, + "loss": 0.7785, + "mean_token_accuracy": 0.7821923494338989, + "num_tokens": 4970384.0, + "step": 129 + }, + { + "epoch": 0.016537336216766316, + "ewc_loss": 0.000377655029296875, + "ewc_loss_parallel": 3.781169652938843e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.5176544189453125, + "learning_rate": 5.468418821534548e-08, + "loss": 0.7875, + "mean_token_accuracy": 0.7764784097671509, + "num_tokens": 5005996.0, + "step": 130 + }, + { + "epoch": 0.016664546495356824, + "ewc_loss": 0.00038909912109375, + "ewc_loss_parallel": 3.8929283618927e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.371910572052002, + "learning_rate": 5.5108096651123356e-08, + "loss": 0.7674, + "mean_token_accuracy": 0.7814100384712219, + "num_tokens": 5044208.0, + "step": 131 + }, + { + "epoch": 0.016791756773947335, + "ewc_loss": 0.0004024505615234375, + "ewc_loss_parallel": 4.023313522338867e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.328114032745361, + "learning_rate": 5.553200508690123e-08, + "loss": 0.7824, + "mean_token_accuracy": 0.7760555148124695, + "num_tokens": 5085173.0, + "step": 132 + }, + { + "epoch": 0.016918967052537846, + "ewc_loss": 0.000423431396484375, + "ewc_loss_parallel": 4.2282044887542725e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.61325740814209, + "learning_rate": 5.59559135226791e-08, + "loss": 0.7997, + "mean_token_accuracy": 0.7737308144569397, + "num_tokens": 5118440.0, + "step": 133 + }, + { + "epoch": 0.017046177331128357, + "ewc_loss": 0.000453948974609375, + "ewc_loss_parallel": 4.544854164123535e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.70325231552124, + "learning_rate": 5.637982195845697e-08, + "loss": 0.804, + "mean_token_accuracy": 0.7740615606307983, + "num_tokens": 5155202.0, + "step": 134 + }, + { + "epoch": 0.017173387609718864, + "ewc_loss": 0.000484466552734375, + "ewc_loss_parallel": 4.842877388000488e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.627960205078125, + "learning_rate": 5.680373039423484e-08, + "loss": 0.7904, + "mean_token_accuracy": 0.7792110443115234, + "num_tokens": 5193240.0, + "step": 135 + }, + { + "epoch": 0.017300597888309375, + "ewc_loss": 0.000537872314453125, + "ewc_loss_parallel": 5.364418029785156e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.710394859313965, + "learning_rate": 5.722763883001271e-08, + "loss": 0.7792, + "mean_token_accuracy": 0.7805018424987793, + "num_tokens": 5230975.0, + "step": 136 + }, + { + "epoch": 0.017427808166899886, + "ewc_loss": 0.00060272216796875, + "ewc_loss_parallel": 6.034970283508301e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 6.112974643707275, + "learning_rate": 5.7651547265790585e-08, + "loss": 0.8431, + "mean_token_accuracy": 0.7633397579193115, + "num_tokens": 5264786.0, + "step": 137 + }, + { + "epoch": 0.017555018445490397, + "ewc_loss": 0.000690460205078125, + "ewc_loss_parallel": 6.891787052154541e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.94351053237915, + "learning_rate": 5.807545570156846e-08, + "loss": 0.8057, + "mean_token_accuracy": 0.7730467319488525, + "num_tokens": 5301554.0, + "step": 138 + }, + { + "epoch": 0.017682228724080904, + "ewc_loss": 0.000762939453125, + "ewc_loss_parallel": 7.636845111846924e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 5.924936771392822, + "learning_rate": 5.849936413734633e-08, + "loss": 0.7455, + "mean_token_accuracy": 0.7891095280647278, + "num_tokens": 5337356.0, + "step": 139 + }, + { + "epoch": 0.017809439002671415, + "ewc_loss": 0.00087738037109375, + "ewc_loss_parallel": 8.791685104370117e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 6.060916423797607, + "learning_rate": 5.89232725731242e-08, + "loss": 0.817, + "mean_token_accuracy": 0.7731615900993347, + "num_tokens": 5375767.0, + "step": 140 + }, + { + "epoch": 0.017936649281261926, + "ewc_loss": 0.0009918212890625, + "ewc_loss_parallel": 9.909272193908691e-07, + "ewc_loss_perp": 0.0, + "grad_norm": 6.034092903137207, + "learning_rate": 5.9347181008902075e-08, + "loss": 0.7679, + "mean_token_accuracy": 0.7817250490188599, + "num_tokens": 5413683.0, + "step": 141 + }, + { + "epoch": 0.018063859559852437, + "ewc_loss": 0.0010528564453125, + "ewc_loss_parallel": 1.0505318641662598e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.826715469360352, + "learning_rate": 5.977108944467995e-08, + "loss": 0.7476, + "mean_token_accuracy": 0.7866340279579163, + "num_tokens": 5457750.0, + "step": 142 + }, + { + "epoch": 0.018191069838442948, + "ewc_loss": 0.00115966796875, + "ewc_loss_parallel": 1.1622905731201172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.0090837478637695, + "learning_rate": 6.019499788045781e-08, + "loss": 0.7432, + "mean_token_accuracy": 0.7887741327285767, + "num_tokens": 5495743.0, + "step": 143 + }, + { + "epoch": 0.018318280117033455, + "ewc_loss": 0.0012054443359375, + "ewc_loss_parallel": 1.2069940567016602e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.047730922698975, + "learning_rate": 6.061890631623569e-08, + "loss": 0.7562, + "mean_token_accuracy": 0.7802528142929077, + "num_tokens": 5532198.0, + "step": 144 + }, + { + "epoch": 0.018445490395623966, + "ewc_loss": 0.0012664794921875, + "ewc_loss_parallel": 1.2665987014770508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.232186317443848, + "learning_rate": 6.104281475201356e-08, + "loss": 0.7691, + "mean_token_accuracy": 0.7811511754989624, + "num_tokens": 5568977.0, + "step": 145 + }, + { + "epoch": 0.018572700674214477, + "ewc_loss": 0.00130462646484375, + "ewc_loss_parallel": 1.30385160446167e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.1436309814453125, + "learning_rate": 6.146672318779143e-08, + "loss": 0.7809, + "mean_token_accuracy": 0.7726761102676392, + "num_tokens": 5606229.0, + "step": 146 + }, + { + "epoch": 0.018699910952804988, + "ewc_loss": 0.0013580322265625, + "ewc_loss_parallel": 1.3560056686401367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.118358612060547, + "learning_rate": 6.18906316235693e-08, + "loss": 0.7811, + "mean_token_accuracy": 0.775063157081604, + "num_tokens": 5649828.0, + "step": 147 + }, + { + "epoch": 0.018827121231395495, + "ewc_loss": 0.00138092041015625, + "ewc_loss_parallel": 1.3783574104309082e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.244716644287109, + "learning_rate": 6.231454005934718e-08, + "loss": 0.7275, + "mean_token_accuracy": 0.7951653599739075, + "num_tokens": 5687433.0, + "step": 148 + }, + { + "epoch": 0.018954331509986006, + "ewc_loss": 0.00141143798828125, + "ewc_loss_parallel": 1.4081597328186035e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.328971862792969, + "learning_rate": 6.273844849512505e-08, + "loss": 0.7582, + "mean_token_accuracy": 0.7820098996162415, + "num_tokens": 5723247.0, + "step": 149 + }, + { + "epoch": 0.019081541788576517, + "ewc_loss": 0.00142669677734375, + "ewc_loss_parallel": 1.4230608940124512e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.268218994140625, + "learning_rate": 6.316235693090292e-08, + "loss": 0.7726, + "mean_token_accuracy": 0.7771285772323608, + "num_tokens": 5760305.0, + "step": 150 + }, + { + "epoch": 0.019208752067167028, + "ewc_loss": 0.00142669677734375, + "ewc_loss_parallel": 1.4230608940124512e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.140386581420898, + "learning_rate": 6.35862653666808e-08, + "loss": 0.7722, + "mean_token_accuracy": 0.7762006521224976, + "num_tokens": 5800586.0, + "step": 151 + }, + { + "epoch": 0.01933596234575754, + "ewc_loss": 0.00142669677734375, + "ewc_loss_parallel": 1.4230608940124512e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.138143539428711, + "learning_rate": 6.401017380245867e-08, + "loss": 0.7637, + "mean_token_accuracy": 0.7778059840202332, + "num_tokens": 5840351.0, + "step": 152 + }, + { + "epoch": 0.019463172624348046, + "ewc_loss": 0.00140380859375, + "ewc_loss_parallel": 1.4007091522216797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.260440826416016, + "learning_rate": 6.443408223823654e-08, + "loss": 0.804, + "mean_token_accuracy": 0.7692397832870483, + "num_tokens": 5880162.0, + "step": 153 + }, + { + "epoch": 0.019590382902938557, + "ewc_loss": 0.00139617919921875, + "ewc_loss_parallel": 1.3932585716247559e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.285397529602051, + "learning_rate": 6.485799067401441e-08, + "loss": 0.8593, + "mean_token_accuracy": 0.7541623115539551, + "num_tokens": 5922213.0, + "step": 154 + }, + { + "epoch": 0.019717593181529068, + "ewc_loss": 0.0013885498046875, + "ewc_loss_parallel": 1.385807991027832e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.20120096206665, + "learning_rate": 6.528189910979228e-08, + "loss": 0.741, + "mean_token_accuracy": 0.7851471900939941, + "num_tokens": 5957461.0, + "step": 155 + }, + { + "epoch": 0.01984480346011958, + "ewc_loss": 0.001373291015625, + "ewc_loss_parallel": 1.3709068298339844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.080012321472168, + "learning_rate": 6.570580754557016e-08, + "loss": 0.7261, + "mean_token_accuracy": 0.784895122051239, + "num_tokens": 5994675.0, + "step": 156 + }, + { + "epoch": 0.019972013738710086, + "ewc_loss": 0.001373291015625, + "ewc_loss_parallel": 1.3709068298339844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.060145378112793, + "learning_rate": 6.612971598134802e-08, + "loss": 0.6884, + "mean_token_accuracy": 0.8011094331741333, + "num_tokens": 6032514.0, + "step": 157 + }, + { + "epoch": 0.020099224017300597, + "ewc_loss": 0.00133514404296875, + "ewc_loss_parallel": 1.3336539268493652e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.120073318481445, + "learning_rate": 6.655362441712589e-08, + "loss": 0.7612, + "mean_token_accuracy": 0.7791165113449097, + "num_tokens": 6069160.0, + "step": 158 + }, + { + "epoch": 0.020226434295891108, + "ewc_loss": 0.00131988525390625, + "ewc_loss_parallel": 1.3187527656555176e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.236578464508057, + "learning_rate": 6.697753285290376e-08, + "loss": 0.8152, + "mean_token_accuracy": 0.7633384466171265, + "num_tokens": 6106652.0, + "step": 159 + }, + { + "epoch": 0.02035364457448162, + "ewc_loss": 0.00130462646484375, + "ewc_loss_parallel": 1.30385160446167e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.142921447753906, + "learning_rate": 6.740144128868163e-08, + "loss": 0.7495, + "mean_token_accuracy": 0.7848194241523743, + "num_tokens": 6143070.0, + "step": 160 + }, + { + "epoch": 0.020480854853072127, + "ewc_loss": 0.00128936767578125, + "ewc_loss_parallel": 1.2889504432678223e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.851146697998047, + "learning_rate": 6.78253497244595e-08, + "loss": 0.7412, + "mean_token_accuracy": 0.7853337526321411, + "num_tokens": 6186495.0, + "step": 161 + }, + { + "epoch": 0.020608065131662637, + "ewc_loss": 0.00124359130859375, + "ewc_loss_parallel": 1.2442469596862793e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.017281532287598, + "learning_rate": 6.824925816023738e-08, + "loss": 0.7421, + "mean_token_accuracy": 0.785398006439209, + "num_tokens": 6222454.0, + "step": 162 + }, + { + "epoch": 0.02073527541025315, + "ewc_loss": 0.0012359619140625, + "ewc_loss_parallel": 1.2367963790893555e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.325032711029053, + "learning_rate": 6.867316659601525e-08, + "loss": 0.7557, + "mean_token_accuracy": 0.7835153341293335, + "num_tokens": 6253260.0, + "step": 163 + }, + { + "epoch": 0.02086248568884366, + "ewc_loss": 0.001220703125, + "ewc_loss_parallel": 1.2218952178955078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.888177394866943, + "learning_rate": 6.909707503179312e-08, + "loss": 0.7351, + "mean_token_accuracy": 0.7892338037490845, + "num_tokens": 6294773.0, + "step": 164 + }, + { + "epoch": 0.02098969596743417, + "ewc_loss": 0.0012054443359375, + "ewc_loss_parallel": 1.2069940567016602e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.016923427581787, + "learning_rate": 6.9520983467571e-08, + "loss": 0.7792, + "mean_token_accuracy": 0.7748324275016785, + "num_tokens": 6335334.0, + "step": 165 + }, + { + "epoch": 0.021116906246024678, + "ewc_loss": 0.00118255615234375, + "ewc_loss_parallel": 1.1846423149108887e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.034970760345459, + "learning_rate": 6.994489190334887e-08, + "loss": 0.7146, + "mean_token_accuracy": 0.7906671762466431, + "num_tokens": 6372255.0, + "step": 166 + }, + { + "epoch": 0.02124411652461519, + "ewc_loss": 0.00115966796875, + "ewc_loss_parallel": 1.1622905731201172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.906264781951904, + "learning_rate": 7.036880033912674e-08, + "loss": 0.6904, + "mean_token_accuracy": 0.7994367480278015, + "num_tokens": 6411971.0, + "step": 167 + }, + { + "epoch": 0.0213713268032057, + "ewc_loss": 0.0011444091796875, + "ewc_loss_parallel": 1.1473894119262695e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.174205780029297, + "learning_rate": 7.079270877490461e-08, + "loss": 0.7621, + "mean_token_accuracy": 0.7787671089172363, + "num_tokens": 6450366.0, + "step": 168 + }, + { + "epoch": 0.02149853708179621, + "ewc_loss": 0.001129150390625, + "ewc_loss_parallel": 1.1324882507324219e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.270016670227051, + "learning_rate": 7.121661721068249e-08, + "loss": 0.7254, + "mean_token_accuracy": 0.7882567644119263, + "num_tokens": 6482199.0, + "step": 169 + }, + { + "epoch": 0.021625747360386718, + "ewc_loss": 0.00112152099609375, + "ewc_loss_parallel": 1.125037670135498e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.918713569641113, + "learning_rate": 7.164052564646036e-08, + "loss": 0.7058, + "mean_token_accuracy": 0.7966461181640625, + "num_tokens": 6519697.0, + "step": 170 + }, + { + "epoch": 0.02175295763897723, + "ewc_loss": 0.0011138916015625, + "ewc_loss_parallel": 1.1101365089416504e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.098674774169922, + "learning_rate": 7.206443408223823e-08, + "loss": 0.767, + "mean_token_accuracy": 0.7778044939041138, + "num_tokens": 6554687.0, + "step": 171 + }, + { + "epoch": 0.02188016791756774, + "ewc_loss": 0.00110626220703125, + "ewc_loss_parallel": 1.1026859283447266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.170716285705566, + "learning_rate": 7.24883425180161e-08, + "loss": 0.7299, + "mean_token_accuracy": 0.7811394333839417, + "num_tokens": 6593066.0, + "step": 172 + }, + { + "epoch": 0.02200737819615825, + "ewc_loss": 0.00109100341796875, + "ewc_loss_parallel": 1.087784767150879e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.843870639801025, + "learning_rate": 7.291225095379398e-08, + "loss": 0.6869, + "mean_token_accuracy": 0.8008129596710205, + "num_tokens": 6635484.0, + "step": 173 + }, + { + "epoch": 0.02213458847474876, + "ewc_loss": 0.00107574462890625, + "ewc_loss_parallel": 1.0728836059570312e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.821542739868164, + "learning_rate": 7.333615938957185e-08, + "loss": 0.7796, + "mean_token_accuracy": 0.7746413946151733, + "num_tokens": 6677090.0, + "step": 174 + }, + { + "epoch": 0.02226179875333927, + "ewc_loss": 0.001068115234375, + "ewc_loss_parallel": 1.0654330253601074e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.08604621887207, + "learning_rate": 7.376006782534971e-08, + "loss": 0.6959, + "mean_token_accuracy": 0.7951148748397827, + "num_tokens": 6710250.0, + "step": 175 + }, + { + "epoch": 0.02238900903192978, + "ewc_loss": 0.001068115234375, + "ewc_loss_parallel": 1.0654330253601074e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.886287212371826, + "learning_rate": 7.418397626112758e-08, + "loss": 0.7214, + "mean_token_accuracy": 0.7869648933410645, + "num_tokens": 6752998.0, + "step": 176 + }, + { + "epoch": 0.02251621931052029, + "ewc_loss": 0.00106048583984375, + "ewc_loss_parallel": 1.0579824447631836e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.021209716796875, + "learning_rate": 7.460788469690545e-08, + "loss": 0.7233, + "mean_token_accuracy": 0.7929136753082275, + "num_tokens": 6789568.0, + "step": 177 + }, + { + "epoch": 0.0226434295891108, + "ewc_loss": 0.00104522705078125, + "ewc_loss_parallel": 1.043081283569336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.208191871643066, + "learning_rate": 7.503179313268333e-08, + "loss": 0.7642, + "mean_token_accuracy": 0.7763253450393677, + "num_tokens": 6822810.0, + "step": 178 + }, + { + "epoch": 0.02277063986770131, + "ewc_loss": 0.0010528564453125, + "ewc_loss_parallel": 1.0505318641662598e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.906447410583496, + "learning_rate": 7.54557015684612e-08, + "loss": 0.7031, + "mean_token_accuracy": 0.7921798229217529, + "num_tokens": 6861598.0, + "step": 179 + }, + { + "epoch": 0.02289785014629182, + "ewc_loss": 0.00102996826171875, + "ewc_loss_parallel": 1.0281801223754883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.947169303894043, + "learning_rate": 7.587961000423907e-08, + "loss": 0.7257, + "mean_token_accuracy": 0.7889186143875122, + "num_tokens": 6900220.0, + "step": 180 + }, + { + "epoch": 0.02302506042488233, + "ewc_loss": 0.0010223388671875, + "ewc_loss_parallel": 1.0207295417785645e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.030984878540039, + "learning_rate": 7.630351844001694e-08, + "loss": 0.8343, + "mean_token_accuracy": 0.7594441175460815, + "num_tokens": 6946983.0, + "step": 181 + }, + { + "epoch": 0.02315227070347284, + "ewc_loss": 0.0010223388671875, + "ewc_loss_parallel": 1.0207295417785645e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.0418195724487305, + "learning_rate": 7.672742687579482e-08, + "loss": 0.7396, + "mean_token_accuracy": 0.785476803779602, + "num_tokens": 6986019.0, + "step": 182 + }, + { + "epoch": 0.02327948098206335, + "ewc_loss": 0.00102996826171875, + "ewc_loss_parallel": 1.0281801223754883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.073606967926025, + "learning_rate": 7.715133531157269e-08, + "loss": 0.7139, + "mean_token_accuracy": 0.7862935662269592, + "num_tokens": 7020722.0, + "step": 183 + }, + { + "epoch": 0.02340669126065386, + "ewc_loss": 0.00102996826171875, + "ewc_loss_parallel": 1.0281801223754883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.99667501449585, + "learning_rate": 7.757524374735056e-08, + "loss": 0.7852, + "mean_token_accuracy": 0.7723340392112732, + "num_tokens": 7062691.0, + "step": 184 + }, + { + "epoch": 0.02353390153924437, + "ewc_loss": 0.00102996826171875, + "ewc_loss_parallel": 1.0281801223754883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.994189262390137, + "learning_rate": 7.799915218312844e-08, + "loss": 0.7329, + "mean_token_accuracy": 0.7878163456916809, + "num_tokens": 7101090.0, + "step": 185 + }, + { + "epoch": 0.02366111181783488, + "ewc_loss": 0.00103759765625, + "ewc_loss_parallel": 1.0356307029724121e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.017747402191162, + "learning_rate": 7.842306061890631e-08, + "loss": 0.7067, + "mean_token_accuracy": 0.7888456583023071, + "num_tokens": 7139891.0, + "step": 186 + }, + { + "epoch": 0.023788322096425393, + "ewc_loss": 0.00103759765625, + "ewc_loss_parallel": 1.0356307029724121e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.058300018310547, + "learning_rate": 7.88469690546842e-08, + "loss": 0.7551, + "mean_token_accuracy": 0.7796720862388611, + "num_tokens": 7179501.0, + "step": 187 + }, + { + "epoch": 0.0239155323750159, + "ewc_loss": 0.00104522705078125, + "ewc_loss_parallel": 1.043081283569336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.082681179046631, + "learning_rate": 7.927087749046207e-08, + "loss": 0.7245, + "mean_token_accuracy": 0.7866575717926025, + "num_tokens": 7213809.0, + "step": 188 + }, + { + "epoch": 0.02404274265360641, + "ewc_loss": 0.0010528564453125, + "ewc_loss_parallel": 1.0505318641662598e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.028855323791504, + "learning_rate": 7.969478592623994e-08, + "loss": 0.8, + "mean_token_accuracy": 0.765119194984436, + "num_tokens": 7254493.0, + "step": 189 + }, + { + "epoch": 0.024169952932196922, + "ewc_loss": 0.00106048583984375, + "ewc_loss_parallel": 1.0579824447631836e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 5.96298360824585, + "learning_rate": 8.011869436201781e-08, + "loss": 0.658, + "mean_token_accuracy": 0.8067280054092407, + "num_tokens": 7292643.0, + "step": 190 + }, + { + "epoch": 0.024297163210787433, + "ewc_loss": 0.00107574462890625, + "ewc_loss_parallel": 1.0728836059570312e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.325003147125244, + "learning_rate": 8.054260279779568e-08, + "loss": 0.7824, + "mean_token_accuracy": 0.7742310166358948, + "num_tokens": 7329810.0, + "step": 191 + }, + { + "epoch": 0.02442437348937794, + "ewc_loss": 0.0010833740234375, + "ewc_loss_parallel": 1.080334186553955e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.041314125061035, + "learning_rate": 8.096651123357356e-08, + "loss": 0.7681, + "mean_token_accuracy": 0.7758057117462158, + "num_tokens": 7367630.0, + "step": 192 + }, + { + "epoch": 0.02455158376796845, + "ewc_loss": 0.00109100341796875, + "ewc_loss_parallel": 1.087784767150879e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.0932936668396, + "learning_rate": 8.139041966935143e-08, + "loss": 0.749, + "mean_token_accuracy": 0.7798691987991333, + "num_tokens": 7411580.0, + "step": 193 + }, + { + "epoch": 0.024678794046558962, + "ewc_loss": 0.0011138916015625, + "ewc_loss_parallel": 1.1101365089416504e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.123937606811523, + "learning_rate": 8.181432810512929e-08, + "loss": 0.7043, + "mean_token_accuracy": 0.7936409711837769, + "num_tokens": 7451631.0, + "step": 194 + }, + { + "epoch": 0.024806004325149473, + "ewc_loss": 0.0011138916015625, + "ewc_loss_parallel": 1.1175870895385742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.040451526641846, + "learning_rate": 8.223823654090716e-08, + "loss": 0.7135, + "mean_token_accuracy": 0.7897235155105591, + "num_tokens": 7493645.0, + "step": 195 + }, + { + "epoch": 0.024933214603739984, + "ewc_loss": 0.001129150390625, + "ewc_loss_parallel": 1.1324882507324219e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.019721031188965, + "learning_rate": 8.266214497668503e-08, + "loss": 0.6742, + "mean_token_accuracy": 0.8034495711326599, + "num_tokens": 7538042.0, + "step": 196 + }, + { + "epoch": 0.02506042488233049, + "ewc_loss": 0.00113677978515625, + "ewc_loss_parallel": 1.1399388313293457e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.125536918640137, + "learning_rate": 8.30860534124629e-08, + "loss": 0.7385, + "mean_token_accuracy": 0.7834575176239014, + "num_tokens": 7575374.0, + "step": 197 + }, + { + "epoch": 0.025187635160921002, + "ewc_loss": 0.0011749267578125, + "ewc_loss_parallel": 1.1771917343139648e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.651110649108887, + "learning_rate": 8.350996184824078e-08, + "loss": 0.7328, + "mean_token_accuracy": 0.7837494611740112, + "num_tokens": 7606880.0, + "step": 198 + }, + { + "epoch": 0.025314845439511513, + "ewc_loss": 0.00119781494140625, + "ewc_loss_parallel": 1.1995434761047363e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.314355373382568, + "learning_rate": 8.393387028401865e-08, + "loss": 0.7095, + "mean_token_accuracy": 0.7922524213790894, + "num_tokens": 7644840.0, + "step": 199 + }, + { + "epoch": 0.025442055718102024, + "ewc_loss": 0.0012054443359375, + "ewc_loss_parallel": 1.2069940567016602e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.1080241203308105, + "learning_rate": 8.435777871979652e-08, + "loss": 0.7495, + "mean_token_accuracy": 0.7787837982177734, + "num_tokens": 7683856.0, + "step": 200 + }, + { + "epoch": 0.02556926599669253, + "ewc_loss": 0.0012054443359375, + "ewc_loss_parallel": 1.2069940567016602e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.327450275421143, + "learning_rate": 8.47816871555744e-08, + "loss": 0.7101, + "mean_token_accuracy": 0.7912582159042358, + "num_tokens": 7715306.0, + "step": 201 + }, + { + "epoch": 0.025696476275283042, + "ewc_loss": 0.00122833251953125, + "ewc_loss_parallel": 1.2293457984924316e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.409172534942627, + "learning_rate": 8.520559559135227e-08, + "loss": 0.669, + "mean_token_accuracy": 0.8021237850189209, + "num_tokens": 7752442.0, + "step": 202 + }, + { + "epoch": 0.025823686553873553, + "ewc_loss": 0.0012664794921875, + "ewc_loss_parallel": 1.2665987014770508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.261364936828613, + "learning_rate": 8.562950402713014e-08, + "loss": 0.7526, + "mean_token_accuracy": 0.7827749848365784, + "num_tokens": 7799891.0, + "step": 203 + }, + { + "epoch": 0.025950896832464064, + "ewc_loss": 0.00127410888671875, + "ewc_loss_parallel": 1.2740492820739746e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.435070991516113, + "learning_rate": 8.605341246290801e-08, + "loss": 0.7447, + "mean_token_accuracy": 0.7799767255783081, + "num_tokens": 7830696.0, + "step": 204 + }, + { + "epoch": 0.026078107111054575, + "ewc_loss": 0.00131988525390625, + "ewc_loss_parallel": 1.3187527656555176e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.2030510902404785, + "learning_rate": 8.647732089868589e-08, + "loss": 0.7136, + "mean_token_accuracy": 0.7921432852745056, + "num_tokens": 7870968.0, + "step": 205 + }, + { + "epoch": 0.026205317389645082, + "ewc_loss": 0.0013580322265625, + "ewc_loss_parallel": 1.3560056686401367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.221837520599365, + "learning_rate": 8.690122933446376e-08, + "loss": 0.7366, + "mean_token_accuracy": 0.7850819230079651, + "num_tokens": 7914418.0, + "step": 206 + }, + { + "epoch": 0.026332527668235593, + "ewc_loss": 0.0013885498046875, + "ewc_loss_parallel": 1.385807991027832e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.1791863441467285, + "learning_rate": 8.732513777024163e-08, + "loss": 0.6746, + "mean_token_accuracy": 0.8061870336532593, + "num_tokens": 7954544.0, + "step": 207 + }, + { + "epoch": 0.026459737946826104, + "ewc_loss": 0.00141143798828125, + "ewc_loss_parallel": 1.4081597328186035e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.364721775054932, + "learning_rate": 8.77490462060195e-08, + "loss": 0.7239, + "mean_token_accuracy": 0.7906659841537476, + "num_tokens": 7997373.0, + "step": 208 + }, + { + "epoch": 0.026586948225416615, + "ewc_loss": 0.001434326171875, + "ewc_loss_parallel": 1.430511474609375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.856248378753662, + "learning_rate": 8.817295464179738e-08, + "loss": 0.6897, + "mean_token_accuracy": 0.7925087213516235, + "num_tokens": 8033314.0, + "step": 209 + }, + { + "epoch": 0.026714158504007122, + "ewc_loss": 0.00146484375, + "ewc_loss_parallel": 1.4677643775939941e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.306034088134766, + "learning_rate": 8.859686307757525e-08, + "loss": 0.7067, + "mean_token_accuracy": 0.7928593158721924, + "num_tokens": 8069362.0, + "step": 210 + }, + { + "epoch": 0.026841368782597633, + "ewc_loss": 0.001495361328125, + "ewc_loss_parallel": 1.4975666999816895e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.711033821105957, + "learning_rate": 8.902077151335312e-08, + "loss": 0.74, + "mean_token_accuracy": 0.7802305221557617, + "num_tokens": 8109126.0, + "step": 211 + }, + { + "epoch": 0.026968579061188144, + "ewc_loss": 0.00153350830078125, + "ewc_loss_parallel": 1.5348196029663086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.703207492828369, + "learning_rate": 8.944467994913098e-08, + "loss": 0.7961, + "mean_token_accuracy": 0.7710479497909546, + "num_tokens": 8143083.0, + "step": 212 + }, + { + "epoch": 0.027095789339778655, + "ewc_loss": 0.00156402587890625, + "ewc_loss_parallel": 1.564621925354004e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.556682109832764, + "learning_rate": 8.986858838490885e-08, + "loss": 0.6436, + "mean_token_accuracy": 0.8045727014541626, + "num_tokens": 8176636.0, + "step": 213 + }, + { + "epoch": 0.027222999618369163, + "ewc_loss": 0.00159454345703125, + "ewc_loss_parallel": 1.5944242477416992e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.5047430992126465, + "learning_rate": 9.029249682068673e-08, + "loss": 0.7018, + "mean_token_accuracy": 0.7915247678756714, + "num_tokens": 8220262.0, + "step": 214 + }, + { + "epoch": 0.027350209896959674, + "ewc_loss": 0.001617431640625, + "ewc_loss_parallel": 1.6167759895324707e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.480494499206543, + "learning_rate": 9.07164052564646e-08, + "loss": 0.7226, + "mean_token_accuracy": 0.7888960838317871, + "num_tokens": 8256303.0, + "step": 215 + }, + { + "epoch": 0.027477420175550184, + "ewc_loss": 0.0016632080078125, + "ewc_loss_parallel": 1.6614794731140137e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.726897716522217, + "learning_rate": 9.114031369224247e-08, + "loss": 0.6859, + "mean_token_accuracy": 0.7986595630645752, + "num_tokens": 8293888.0, + "step": 216 + }, + { + "epoch": 0.027604630454140695, + "ewc_loss": 0.00170135498046875, + "ewc_loss_parallel": 1.6987323760986328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.412038803100586, + "learning_rate": 9.156422212802034e-08, + "loss": 0.6473, + "mean_token_accuracy": 0.8077428340911865, + "num_tokens": 8335368.0, + "step": 217 + }, + { + "epoch": 0.027731840732731206, + "ewc_loss": 0.00171661376953125, + "ewc_loss_parallel": 1.7136335372924805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.0726704597473145, + "learning_rate": 9.198813056379822e-08, + "loss": 0.7039, + "mean_token_accuracy": 0.7946972846984863, + "num_tokens": 8370338.0, + "step": 218 + }, + { + "epoch": 0.027859051011321714, + "ewc_loss": 0.00174713134765625, + "ewc_loss_parallel": 1.7508864402770996e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.666274070739746, + "learning_rate": 9.241203899957609e-08, + "loss": 0.6879, + "mean_token_accuracy": 0.796484649181366, + "num_tokens": 8409904.0, + "step": 219 + }, + { + "epoch": 0.027986261289912225, + "ewc_loss": 0.00177001953125, + "ewc_loss_parallel": 1.773238182067871e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.750667095184326, + "learning_rate": 9.283594743535396e-08, + "loss": 0.8154, + "mean_token_accuracy": 0.7631980180740356, + "num_tokens": 8446579.0, + "step": 220 + }, + { + "epoch": 0.028113471568502735, + "ewc_loss": 0.001800537109375, + "ewc_loss_parallel": 1.8030405044555664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.38192081451416, + "learning_rate": 9.325985587113183e-08, + "loss": 0.6606, + "mean_token_accuracy": 0.8058763742446899, + "num_tokens": 8483663.0, + "step": 221 + }, + { + "epoch": 0.028240681847093246, + "ewc_loss": 0.0018310546875, + "ewc_loss_parallel": 1.8328428268432617e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.68411111831665, + "learning_rate": 9.368376430690971e-08, + "loss": 0.6943, + "mean_token_accuracy": 0.7941595315933228, + "num_tokens": 8516346.0, + "step": 222 + }, + { + "epoch": 0.028367892125683754, + "ewc_loss": 0.00185394287109375, + "ewc_loss_parallel": 1.8551945686340332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.987332820892334, + "learning_rate": 9.410767274268758e-08, + "loss": 0.6885, + "mean_token_accuracy": 0.7983835339546204, + "num_tokens": 8556914.0, + "step": 223 + }, + { + "epoch": 0.028495102404274265, + "ewc_loss": 0.00188446044921875, + "ewc_loss_parallel": 1.8849968910217285e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.2632951736450195, + "learning_rate": 9.453158117846545e-08, + "loss": 0.6758, + "mean_token_accuracy": 0.7970216274261475, + "num_tokens": 8596548.0, + "step": 224 + }, + { + "epoch": 0.028622312682864776, + "ewc_loss": 0.001922607421875, + "ewc_loss_parallel": 1.9222497940063477e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.05104923248291, + "learning_rate": 9.495548961424333e-08, + "loss": 0.707, + "mean_token_accuracy": 0.794918417930603, + "num_tokens": 8634366.0, + "step": 225 + }, + { + "epoch": 0.028749522961455286, + "ewc_loss": 0.0019378662109375, + "ewc_loss_parallel": 1.9371509552001953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.819660186767578, + "learning_rate": 9.53793980500212e-08, + "loss": 0.6473, + "mean_token_accuracy": 0.8065747022628784, + "num_tokens": 8666480.0, + "step": 226 + }, + { + "epoch": 0.028876733240045797, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.1865410804748535, + "learning_rate": 9.580330648579907e-08, + "loss": 0.7965, + "mean_token_accuracy": 0.7708466649055481, + "num_tokens": 8705880.0, + "step": 227 + }, + { + "epoch": 0.029003943518636305, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.931376934051514, + "learning_rate": 9.622721492157694e-08, + "loss": 0.6588, + "mean_token_accuracy": 0.8047986626625061, + "num_tokens": 8743772.0, + "step": 228 + }, + { + "epoch": 0.029131153797226816, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.913638114929199, + "learning_rate": 9.665112335735482e-08, + "loss": 0.6842, + "mean_token_accuracy": 0.7931594848632812, + "num_tokens": 8775919.0, + "step": 229 + }, + { + "epoch": 0.029258364075817327, + "ewc_loss": 0.001983642578125, + "ewc_loss_parallel": 1.9818544387817383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.150176048278809, + "learning_rate": 9.707503179313267e-08, + "loss": 0.6309, + "mean_token_accuracy": 0.8107359409332275, + "num_tokens": 8809336.0, + "step": 230 + }, + { + "epoch": 0.029385574354407838, + "ewc_loss": 0.001983642578125, + "ewc_loss_parallel": 1.9818544387817383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.877656936645508, + "learning_rate": 9.749894022891055e-08, + "loss": 0.6824, + "mean_token_accuracy": 0.7963889241218567, + "num_tokens": 8845952.0, + "step": 231 + }, + { + "epoch": 0.029512784632998345, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.561822414398193, + "learning_rate": 9.792284866468842e-08, + "loss": 0.7026, + "mean_token_accuracy": 0.7875009775161743, + "num_tokens": 8889801.0, + "step": 232 + }, + { + "epoch": 0.029639994911588856, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.3109283447265625, + "learning_rate": 9.834675710046629e-08, + "loss": 0.6624, + "mean_token_accuracy": 0.7988977432250977, + "num_tokens": 8925429.0, + "step": 233 + }, + { + "epoch": 0.029767205190179367, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.97318696975708, + "learning_rate": 9.877066553624416e-08, + "loss": 0.6937, + "mean_token_accuracy": 0.7921556234359741, + "num_tokens": 8963360.0, + "step": 234 + }, + { + "epoch": 0.029894415468769878, + "ewc_loss": 0.0019989013671875, + "ewc_loss_parallel": 1.996755599975586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.188724517822266, + "learning_rate": 9.919457397202204e-08, + "loss": 0.7066, + "mean_token_accuracy": 0.7929096221923828, + "num_tokens": 8998314.0, + "step": 235 + }, + { + "epoch": 0.030021625747360385, + "ewc_loss": 0.0019989013671875, + "ewc_loss_parallel": 1.996755599975586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.066784381866455, + "learning_rate": 9.961848240779991e-08, + "loss": 0.7769, + "mean_token_accuracy": 0.7732815146446228, + "num_tokens": 9034804.0, + "step": 236 + }, + { + "epoch": 0.030148836025950896, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.868381023406982, + "learning_rate": 1.0004239084357778e-07, + "loss": 0.6991, + "mean_token_accuracy": 0.7936979532241821, + "num_tokens": 9070840.0, + "step": 237 + }, + { + "epoch": 0.030276046304541407, + "ewc_loss": 0.001983642578125, + "ewc_loss_parallel": 1.9818544387817383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.94923734664917, + "learning_rate": 1.0046629927935566e-07, + "loss": 0.6793, + "mean_token_accuracy": 0.8003249168395996, + "num_tokens": 9109370.0, + "step": 238 + }, + { + "epoch": 0.030403256583131918, + "ewc_loss": 0.002044677734375, + "ewc_loss_parallel": 2.041459083557129e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.921803951263428, + "learning_rate": 1.0089020771513353e-07, + "loss": 0.6211, + "mean_token_accuracy": 0.8139144778251648, + "num_tokens": 9148191.0, + "step": 239 + }, + { + "epoch": 0.03053046686172243, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.80225133895874, + "learning_rate": 1.013141161509114e-07, + "loss": 0.6891, + "mean_token_accuracy": 0.7952370643615723, + "num_tokens": 9182923.0, + "step": 240 + }, + { + "epoch": 0.030657677140312936, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.34101676940918, + "learning_rate": 1.0173802458668927e-07, + "loss": 0.6894, + "mean_token_accuracy": 0.7928320169448853, + "num_tokens": 9215882.0, + "step": 241 + }, + { + "epoch": 0.030784887418903447, + "ewc_loss": 0.0019989013671875, + "ewc_loss_parallel": 1.996755599975586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.337912559509277, + "learning_rate": 1.0216193302246715e-07, + "loss": 0.7217, + "mean_token_accuracy": 0.7809464931488037, + "num_tokens": 9249718.0, + "step": 242 + }, + { + "epoch": 0.030912097697493958, + "ewc_loss": 0.0020904541015625, + "ewc_loss_parallel": 2.086162567138672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.757498741149902, + "learning_rate": 1.0258584145824502e-07, + "loss": 0.6406, + "mean_token_accuracy": 0.8092440366744995, + "num_tokens": 9291146.0, + "step": 243 + }, + { + "epoch": 0.03103930797608447, + "ewc_loss": 0.0019989013671875, + "ewc_loss_parallel": 1.996755599975586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.773455619812012, + "learning_rate": 1.0300974989402289e-07, + "loss": 0.635, + "mean_token_accuracy": 0.8088030815124512, + "num_tokens": 9326403.0, + "step": 244 + }, + { + "epoch": 0.031166518254674976, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.569150924682617, + "learning_rate": 1.0343365832980076e-07, + "loss": 0.6875, + "mean_token_accuracy": 0.795070230960846, + "num_tokens": 9368491.0, + "step": 245 + }, + { + "epoch": 0.03129372853326549, + "ewc_loss": 0.00201416015625, + "ewc_loss_parallel": 2.0116567611694336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.487648010253906, + "learning_rate": 1.0385756676557864e-07, + "loss": 0.6835, + "mean_token_accuracy": 0.7973704934120178, + "num_tokens": 9409041.0, + "step": 246 + }, + { + "epoch": 0.031420938811855995, + "ewc_loss": 0.001983642578125, + "ewc_loss_parallel": 1.9818544387817383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.13338041305542, + "learning_rate": 1.0428147520135651e-07, + "loss": 0.6553, + "mean_token_accuracy": 0.8048714399337769, + "num_tokens": 9448591.0, + "step": 247 + }, + { + "epoch": 0.03154814909044651, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.108875274658203, + "learning_rate": 1.0470538363713437e-07, + "loss": 0.6864, + "mean_token_accuracy": 0.7938442230224609, + "num_tokens": 9486604.0, + "step": 248 + }, + { + "epoch": 0.031675359369037016, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.326183795928955, + "learning_rate": 1.0512929207291224e-07, + "loss": 0.6924, + "mean_token_accuracy": 0.7885834574699402, + "num_tokens": 9525285.0, + "step": 249 + }, + { + "epoch": 0.03180256964762753, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.776183128356934, + "learning_rate": 1.0555320050869011e-07, + "loss": 0.6726, + "mean_token_accuracy": 0.794941246509552, + "num_tokens": 9557898.0, + "step": 250 + }, + { + "epoch": 0.03192977992621804, + "ewc_loss": 0.002044677734375, + "ewc_loss_parallel": 2.041459083557129e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.466505527496338, + "learning_rate": 1.0597710894446799e-07, + "loss": 0.7332, + "mean_token_accuracy": 0.7786881923675537, + "num_tokens": 9590438.0, + "step": 251 + }, + { + "epoch": 0.032056990204808546, + "ewc_loss": 0.0019989013671875, + "ewc_loss_parallel": 1.996755599975586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.208379745483398, + "learning_rate": 1.0640101738024586e-07, + "loss": 0.6406, + "mean_token_accuracy": 0.807834267616272, + "num_tokens": 9631402.0, + "step": 252 + }, + { + "epoch": 0.03218420048339906, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.460002899169922, + "learning_rate": 1.0682492581602373e-07, + "loss": 0.6243, + "mean_token_accuracy": 0.812977135181427, + "num_tokens": 9663702.0, + "step": 253 + }, + { + "epoch": 0.03231141076198957, + "ewc_loss": 0.0020294189453125, + "ewc_loss_parallel": 2.0265579223632812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.23025131225586, + "learning_rate": 1.072488342518016e-07, + "loss": 0.6108, + "mean_token_accuracy": 0.8134795427322388, + "num_tokens": 9698969.0, + "step": 254 + }, + { + "epoch": 0.03243862104058008, + "ewc_loss": 0.00201416015625, + "ewc_loss_parallel": 2.0116567611694336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.322776794433594, + "learning_rate": 1.0767274268757948e-07, + "loss": 0.6804, + "mean_token_accuracy": 0.7945481538772583, + "num_tokens": 9738075.0, + "step": 255 + }, + { + "epoch": 0.03256583131917059, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.9583868980407715, + "learning_rate": 1.0809665112335735e-07, + "loss": 0.6845, + "mean_token_accuracy": 0.7973679900169373, + "num_tokens": 9782675.0, + "step": 256 + }, + { + "epoch": 0.0326930415977611, + "ewc_loss": 0.001922607421875, + "ewc_loss_parallel": 1.9222497940063477e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.293177604675293, + "learning_rate": 1.0852055955913522e-07, + "loss": 0.5845, + "mean_token_accuracy": 0.8226860761642456, + "num_tokens": 9820585.0, + "step": 257 + }, + { + "epoch": 0.03282025187635161, + "ewc_loss": 0.0019073486328125, + "ewc_loss_parallel": 1.9073486328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.0636568069458, + "learning_rate": 1.089444679949131e-07, + "loss": 0.7101, + "mean_token_accuracy": 0.7892480492591858, + "num_tokens": 9860693.0, + "step": 258 + }, + { + "epoch": 0.03294746215494212, + "ewc_loss": 0.001983642578125, + "ewc_loss_parallel": 1.9818544387817383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.798069000244141, + "learning_rate": 1.0936837643069097e-07, + "loss": 0.6424, + "mean_token_accuracy": 0.8072253465652466, + "num_tokens": 9902629.0, + "step": 259 + }, + { + "epoch": 0.03307467243353263, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.444677352905273, + "learning_rate": 1.0979228486646884e-07, + "loss": 0.6147, + "mean_token_accuracy": 0.8125289678573608, + "num_tokens": 9937304.0, + "step": 260 + }, + { + "epoch": 0.03320188271212314, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.186314582824707, + "learning_rate": 1.1021619330224671e-07, + "loss": 0.7482, + "mean_token_accuracy": 0.7774312496185303, + "num_tokens": 9976911.0, + "step": 261 + }, + { + "epoch": 0.03332909299071365, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.33156681060791, + "learning_rate": 1.1064010173802458e-07, + "loss": 0.6819, + "mean_token_accuracy": 0.7969212532043457, + "num_tokens": 10015740.0, + "step": 262 + }, + { + "epoch": 0.03345630326930416, + "ewc_loss": 0.0018768310546875, + "ewc_loss_parallel": 1.8775463104248047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.871378421783447, + "learning_rate": 1.1106401017380246e-07, + "loss": 0.6202, + "mean_token_accuracy": 0.8144707679748535, + "num_tokens": 10053054.0, + "step": 263 + }, + { + "epoch": 0.03358351354789467, + "ewc_loss": 0.00189971923828125, + "ewc_loss_parallel": 1.8998980522155762e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.2312593460083, + "learning_rate": 1.1148791860958033e-07, + "loss": 0.719, + "mean_token_accuracy": 0.7816762924194336, + "num_tokens": 10082088.0, + "step": 264 + }, + { + "epoch": 0.03371072382648518, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.196681976318359, + "learning_rate": 1.119118270453582e-07, + "loss": 0.6894, + "mean_token_accuracy": 0.7928994297981262, + "num_tokens": 10121122.0, + "step": 265 + }, + { + "epoch": 0.03383793410507569, + "ewc_loss": 0.0019073486328125, + "ewc_loss_parallel": 1.9073486328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.585251808166504, + "learning_rate": 1.1233573548113607e-07, + "loss": 0.6833, + "mean_token_accuracy": 0.7967028617858887, + "num_tokens": 10161017.0, + "step": 266 + }, + { + "epoch": 0.0339651443836662, + "ewc_loss": 0.00185394287109375, + "ewc_loss_parallel": 1.8551945686340332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 6.922121524810791, + "learning_rate": 1.1275964391691393e-07, + "loss": 0.6476, + "mean_token_accuracy": 0.8062832951545715, + "num_tokens": 10204913.0, + "step": 267 + }, + { + "epoch": 0.03409235466225671, + "ewc_loss": 0.0018157958984375, + "ewc_loss_parallel": 1.817941665649414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.047563552856445, + "learning_rate": 1.131835523526918e-07, + "loss": 0.6966, + "mean_token_accuracy": 0.7894207835197449, + "num_tokens": 10245154.0, + "step": 268 + }, + { + "epoch": 0.03421956494084722, + "ewc_loss": 0.00188446044921875, + "ewc_loss_parallel": 1.8849968910217285e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.972006320953369, + "learning_rate": 1.1360746078846968e-07, + "loss": 0.704, + "mean_token_accuracy": 0.7887883186340332, + "num_tokens": 10283006.0, + "step": 269 + }, + { + "epoch": 0.03434677521943773, + "ewc_loss": 0.00189208984375, + "ewc_loss_parallel": 1.8924474716186523e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.572866439819336, + "learning_rate": 1.1403136922424755e-07, + "loss": 0.6646, + "mean_token_accuracy": 0.7992357611656189, + "num_tokens": 10316689.0, + "step": 270 + }, + { + "epoch": 0.03447398549802824, + "ewc_loss": 0.0019073486328125, + "ewc_loss_parallel": 1.9073486328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.22103500366211, + "learning_rate": 1.1445527766002542e-07, + "loss": 0.6581, + "mean_token_accuracy": 0.7999957799911499, + "num_tokens": 10358145.0, + "step": 271 + }, + { + "epoch": 0.03460119577661875, + "ewc_loss": 0.00188446044921875, + "ewc_loss_parallel": 1.8849968910217285e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.988892078399658, + "learning_rate": 1.148791860958033e-07, + "loss": 0.7597, + "mean_token_accuracy": 0.7743803858757019, + "num_tokens": 10393424.0, + "step": 272 + }, + { + "epoch": 0.034728406055209264, + "ewc_loss": 0.00183868408203125, + "ewc_loss_parallel": 1.8402934074401855e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.019203186035156, + "learning_rate": 1.1530309453158117e-07, + "loss": 0.6725, + "mean_token_accuracy": 0.7974406480789185, + "num_tokens": 10433560.0, + "step": 273 + }, + { + "epoch": 0.03485561633379977, + "ewc_loss": 0.001800537109375, + "ewc_loss_parallel": 1.8030405044555664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.13125467300415, + "learning_rate": 1.1572700296735904e-07, + "loss": 0.67, + "mean_token_accuracy": 0.7971447706222534, + "num_tokens": 10472024.0, + "step": 274 + }, + { + "epoch": 0.03498282661239028, + "ewc_loss": 0.0018157958984375, + "ewc_loss_parallel": 1.817941665649414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.735804557800293, + "learning_rate": 1.1615091140313691e-07, + "loss": 0.7092, + "mean_token_accuracy": 0.787671685218811, + "num_tokens": 10510874.0, + "step": 275 + }, + { + "epoch": 0.03511003689098079, + "ewc_loss": 0.001861572265625, + "ewc_loss_parallel": 1.862645149230957e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.635549545288086, + "learning_rate": 1.1657481983891479e-07, + "loss": 0.646, + "mean_token_accuracy": 0.8061257600784302, + "num_tokens": 10552411.0, + "step": 276 + }, + { + "epoch": 0.0352372471695713, + "ewc_loss": 0.0018768310546875, + "ewc_loss_parallel": 1.8775463104248047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.701191425323486, + "learning_rate": 1.1699872827469266e-07, + "loss": 0.6629, + "mean_token_accuracy": 0.8017622232437134, + "num_tokens": 10591537.0, + "step": 277 + }, + { + "epoch": 0.03536445744816181, + "ewc_loss": 0.00185394287109375, + "ewc_loss_parallel": 1.8551945686340332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.400672435760498, + "learning_rate": 1.1742263671047053e-07, + "loss": 0.6709, + "mean_token_accuracy": 0.7990144491195679, + "num_tokens": 10632228.0, + "step": 278 + }, + { + "epoch": 0.03549166772675232, + "ewc_loss": 0.0018310546875, + "ewc_loss_parallel": 1.8328428268432617e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.349424362182617, + "learning_rate": 1.178465451462484e-07, + "loss": 0.7268, + "mean_token_accuracy": 0.78078293800354, + "num_tokens": 10672829.0, + "step": 279 + }, + { + "epoch": 0.03561887800534283, + "ewc_loss": 0.001922607421875, + "ewc_loss_parallel": 1.9222497940063477e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.654672622680664, + "learning_rate": 1.1827045358202628e-07, + "loss": 0.7008, + "mean_token_accuracy": 0.7951106429100037, + "num_tokens": 10706146.0, + "step": 280 + }, + { + "epoch": 0.035746088283933344, + "ewc_loss": 0.0019378662109375, + "ewc_loss_parallel": 1.9371509552001953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.160304069519043, + "learning_rate": 1.1869436201780415e-07, + "loss": 0.6168, + "mean_token_accuracy": 0.8153817057609558, + "num_tokens": 10744112.0, + "step": 281 + }, + { + "epoch": 0.03587329856252385, + "ewc_loss": 0.00185394287109375, + "ewc_loss_parallel": 1.8551945686340332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.016385078430176, + "learning_rate": 1.1911827045358202e-07, + "loss": 0.6481, + "mean_token_accuracy": 0.8102145791053772, + "num_tokens": 10783726.0, + "step": 282 + }, + { + "epoch": 0.03600050884111436, + "ewc_loss": 0.0018310546875, + "ewc_loss_parallel": 1.8328428268432617e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.370865345001221, + "learning_rate": 1.195421788893599e-07, + "loss": 0.7122, + "mean_token_accuracy": 0.788852334022522, + "num_tokens": 10824476.0, + "step": 283 + }, + { + "epoch": 0.036127719119704874, + "ewc_loss": 0.0018157958984375, + "ewc_loss_parallel": 1.817941665649414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.282337188720703, + "learning_rate": 1.1996608732513778e-07, + "loss": 0.6726, + "mean_token_accuracy": 0.7993762493133545, + "num_tokens": 10859997.0, + "step": 284 + }, + { + "epoch": 0.03625492939829538, + "ewc_loss": 0.00186920166015625, + "ewc_loss_parallel": 1.8700957298278809e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.995033264160156, + "learning_rate": 1.2038999576091563e-07, + "loss": 0.6432, + "mean_token_accuracy": 0.8076287508010864, + "num_tokens": 10898800.0, + "step": 285 + }, + { + "epoch": 0.036382139676885895, + "ewc_loss": 0.0019073486328125, + "ewc_loss_parallel": 1.9073486328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.731472969055176, + "learning_rate": 1.208139041966935e-07, + "loss": 0.6083, + "mean_token_accuracy": 0.8175337910652161, + "num_tokens": 10937462.0, + "step": 286 + }, + { + "epoch": 0.0365093499554764, + "ewc_loss": 0.0018310546875, + "ewc_loss_parallel": 1.8328428268432617e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.470982551574707, + "learning_rate": 1.2123781263247137e-07, + "loss": 0.6707, + "mean_token_accuracy": 0.7957923412322998, + "num_tokens": 10970739.0, + "step": 287 + }, + { + "epoch": 0.03663656023406691, + "ewc_loss": 0.00186920166015625, + "ewc_loss_parallel": 1.8700957298278809e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.311945915222168, + "learning_rate": 1.2166172106824924e-07, + "loss": 0.7225, + "mean_token_accuracy": 0.7901273369789124, + "num_tokens": 11003000.0, + "step": 288 + }, + { + "epoch": 0.036763770512657425, + "ewc_loss": 0.001861572265625, + "ewc_loss_parallel": 1.862645149230957e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.730347156524658, + "learning_rate": 1.2208562950402712e-07, + "loss": 0.6268, + "mean_token_accuracy": 0.8083658814430237, + "num_tokens": 11039665.0, + "step": 289 + }, + { + "epoch": 0.03689098079124793, + "ewc_loss": 0.00185394287109375, + "ewc_loss_parallel": 1.8551945686340332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.197889804840088, + "learning_rate": 1.22509537939805e-07, + "loss": 0.6539, + "mean_token_accuracy": 0.7974565625190735, + "num_tokens": 11078368.0, + "step": 290 + }, + { + "epoch": 0.03701819106983844, + "ewc_loss": 0.0018310546875, + "ewc_loss_parallel": 1.8328428268432617e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.532402038574219, + "learning_rate": 1.2293344637558286e-07, + "loss": 0.6795, + "mean_token_accuracy": 0.7967942953109741, + "num_tokens": 11122654.0, + "step": 291 + }, + { + "epoch": 0.037145401348428954, + "ewc_loss": 0.00185394287109375, + "ewc_loss_parallel": 1.8551945686340332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.065274238586426, + "learning_rate": 1.2335735481136073e-07, + "loss": 0.6357, + "mean_token_accuracy": 0.8029918670654297, + "num_tokens": 11162622.0, + "step": 292 + }, + { + "epoch": 0.03727261162701946, + "ewc_loss": 0.0019378662109375, + "ewc_loss_parallel": 1.9371509552001953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.797054290771484, + "learning_rate": 1.237812632471386e-07, + "loss": 0.6467, + "mean_token_accuracy": 0.8054604530334473, + "num_tokens": 11202564.0, + "step": 293 + }, + { + "epoch": 0.037399821905609976, + "ewc_loss": 0.001953125, + "ewc_loss_parallel": 1.952052116394043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.732356071472168, + "learning_rate": 1.2420517168291648e-07, + "loss": 0.6653, + "mean_token_accuracy": 0.7955596446990967, + "num_tokens": 11243422.0, + "step": 294 + }, + { + "epoch": 0.03752703218420048, + "ewc_loss": 0.001922607421875, + "ewc_loss_parallel": 1.9222497940063477e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.236396789550781, + "learning_rate": 1.2462908011869435e-07, + "loss": 0.6464, + "mean_token_accuracy": 0.8014552593231201, + "num_tokens": 11280867.0, + "step": 295 + }, + { + "epoch": 0.03765424246279099, + "ewc_loss": 0.00188446044921875, + "ewc_loss_parallel": 1.8849968910217285e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.621509552001953, + "learning_rate": 1.2505298855447223e-07, + "loss": 0.5943, + "mean_token_accuracy": 0.8191545009613037, + "num_tokens": 11318454.0, + "step": 296 + }, + { + "epoch": 0.037781452741381505, + "ewc_loss": 0.0019378662109375, + "ewc_loss_parallel": 1.9371509552001953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.789803981781006, + "learning_rate": 1.254768969902501e-07, + "loss": 0.7364, + "mean_token_accuracy": 0.781505823135376, + "num_tokens": 11354721.0, + "step": 297 + }, + { + "epoch": 0.03790866301997201, + "ewc_loss": 0.0018768310546875, + "ewc_loss_parallel": 1.8775463104248047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.320802688598633, + "learning_rate": 1.2590080542602797e-07, + "loss": 0.6951, + "mean_token_accuracy": 0.7916209101676941, + "num_tokens": 11385938.0, + "step": 298 + }, + { + "epoch": 0.03803587329856253, + "ewc_loss": 0.00189971923828125, + "ewc_loss_parallel": 1.8998980522155762e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.315176486968994, + "learning_rate": 1.2632471386180584e-07, + "loss": 0.6736, + "mean_token_accuracy": 0.7975955009460449, + "num_tokens": 11424911.0, + "step": 299 + }, + { + "epoch": 0.038163083577153034, + "ewc_loss": 0.0018768310546875, + "ewc_loss_parallel": 1.8775463104248047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.669283390045166, + "learning_rate": 1.2674862229758372e-07, + "loss": 0.6405, + "mean_token_accuracy": 0.8080927729606628, + "num_tokens": 11460580.0, + "step": 300 + }, + { + "epoch": 0.03829029385574354, + "ewc_loss": 0.00189971923828125, + "ewc_loss_parallel": 1.8998980522155762e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.929276943206787, + "learning_rate": 1.271725307333616e-07, + "loss": 0.6311, + "mean_token_accuracy": 0.8110893964767456, + "num_tokens": 11501031.0, + "step": 301 + }, + { + "epoch": 0.038417504134334056, + "ewc_loss": 0.0019378662109375, + "ewc_loss_parallel": 1.9371509552001953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.036404609680176, + "learning_rate": 1.2759643916913946e-07, + "loss": 0.6407, + "mean_token_accuracy": 0.8075470328330994, + "num_tokens": 11544053.0, + "step": 302 + }, + { + "epoch": 0.03854471441292456, + "ewc_loss": 0.0019378662109375, + "ewc_loss_parallel": 1.9371509552001953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.680339813232422, + "learning_rate": 1.2802034760491733e-07, + "loss": 0.6577, + "mean_token_accuracy": 0.7991933822631836, + "num_tokens": 11576944.0, + "step": 303 + }, + { + "epoch": 0.03867192469151508, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.21132755279541, + "learning_rate": 1.284442560406952e-07, + "loss": 0.7075, + "mean_token_accuracy": 0.7862000465393066, + "num_tokens": 11619735.0, + "step": 304 + }, + { + "epoch": 0.038799134970105585, + "ewc_loss": 0.0019378662109375, + "ewc_loss_parallel": 1.9371509552001953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.487079620361328, + "learning_rate": 1.2886816447647308e-07, + "loss": 0.7453, + "mean_token_accuracy": 0.7738064527511597, + "num_tokens": 11655025.0, + "step": 305 + }, + { + "epoch": 0.03892634524869609, + "ewc_loss": 0.001922607421875, + "ewc_loss_parallel": 1.9222497940063477e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.325751304626465, + "learning_rate": 1.2929207291225095e-07, + "loss": 0.6414, + "mean_token_accuracy": 0.806957483291626, + "num_tokens": 11691970.0, + "step": 306 + }, + { + "epoch": 0.03905355552728661, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.18522834777832, + "learning_rate": 1.2971598134802882e-07, + "loss": 0.6572, + "mean_token_accuracy": 0.8003212213516235, + "num_tokens": 11729561.0, + "step": 307 + }, + { + "epoch": 0.039180765805877114, + "ewc_loss": 0.001983642578125, + "ewc_loss_parallel": 1.9818544387817383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.479305267333984, + "learning_rate": 1.301398897838067e-07, + "loss": 0.677, + "mean_token_accuracy": 0.7935763597488403, + "num_tokens": 11765707.0, + "step": 308 + }, + { + "epoch": 0.03930797608446762, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.887449741363525, + "learning_rate": 1.3056379821958457e-07, + "loss": 0.686, + "mean_token_accuracy": 0.7990979552268982, + "num_tokens": 11806223.0, + "step": 309 + }, + { + "epoch": 0.039435186363058136, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.61956787109375, + "learning_rate": 1.3098770665536244e-07, + "loss": 0.6404, + "mean_token_accuracy": 0.8068702816963196, + "num_tokens": 11845477.0, + "step": 310 + }, + { + "epoch": 0.039562396641648644, + "ewc_loss": 0.0019683837890625, + "ewc_loss_parallel": 1.9669532775878906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.388252258300781, + "learning_rate": 1.3141161509114031e-07, + "loss": 0.6258, + "mean_token_accuracy": 0.8059918880462646, + "num_tokens": 11877973.0, + "step": 311 + }, + { + "epoch": 0.03968960692023916, + "ewc_loss": 0.00201416015625, + "ewc_loss_parallel": 2.0116567611694336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.567529678344727, + "learning_rate": 1.3183552352691819e-07, + "loss": 0.6342, + "mean_token_accuracy": 0.8078155517578125, + "num_tokens": 11912093.0, + "step": 312 + }, + { + "epoch": 0.039816817198829665, + "ewc_loss": 0.0020294189453125, + "ewc_loss_parallel": 2.0265579223632812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.237844467163086, + "learning_rate": 1.3225943196269603e-07, + "loss": 0.6585, + "mean_token_accuracy": 0.8047814965248108, + "num_tokens": 11952541.0, + "step": 313 + }, + { + "epoch": 0.03994402747742017, + "ewc_loss": 0.0020294189453125, + "ewc_loss_parallel": 2.0265579223632812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.347655296325684, + "learning_rate": 1.3268334039847393e-07, + "loss": 0.6359, + "mean_token_accuracy": 0.806073784828186, + "num_tokens": 11990414.0, + "step": 314 + }, + { + "epoch": 0.04007123775601069, + "ewc_loss": 0.0020294189453125, + "ewc_loss_parallel": 2.0265579223632812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.99641227722168, + "learning_rate": 1.3310724883425178e-07, + "loss": 0.7049, + "mean_token_accuracy": 0.7847491502761841, + "num_tokens": 12025925.0, + "step": 315 + }, + { + "epoch": 0.040198448034601195, + "ewc_loss": 0.0020294189453125, + "ewc_loss_parallel": 2.0265579223632812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.74501895904541, + "learning_rate": 1.3353115727002968e-07, + "loss": 0.6696, + "mean_token_accuracy": 0.7981827259063721, + "num_tokens": 12068879.0, + "step": 316 + }, + { + "epoch": 0.04032565831319171, + "ewc_loss": 0.0020294189453125, + "ewc_loss_parallel": 2.0265579223632812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.19179630279541, + "learning_rate": 1.3395506570580752e-07, + "loss": 0.6358, + "mean_token_accuracy": 0.806449294090271, + "num_tokens": 12111931.0, + "step": 317 + }, + { + "epoch": 0.040452868591782216, + "ewc_loss": 0.0020751953125, + "ewc_loss_parallel": 2.0712614059448242e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.152350425720215, + "learning_rate": 1.3437897414158542e-07, + "loss": 0.7069, + "mean_token_accuracy": 0.7834283113479614, + "num_tokens": 12140431.0, + "step": 318 + }, + { + "epoch": 0.040580078870372724, + "ewc_loss": 0.0020904541015625, + "ewc_loss_parallel": 2.086162567138672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.72072696685791, + "learning_rate": 1.3480288257736327e-07, + "loss": 0.6333, + "mean_token_accuracy": 0.8067511320114136, + "num_tokens": 12173932.0, + "step": 319 + }, + { + "epoch": 0.04070728914896324, + "ewc_loss": 0.0020904541015625, + "ewc_loss_parallel": 2.086162567138672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.729561805725098, + "learning_rate": 1.3522679101314117e-07, + "loss": 0.6124, + "mean_token_accuracy": 0.811016321182251, + "num_tokens": 12210040.0, + "step": 320 + }, + { + "epoch": 0.040834499427553746, + "ewc_loss": 0.0020599365234375, + "ewc_loss_parallel": 2.0563602447509766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.37445068359375, + "learning_rate": 1.35650699448919e-07, + "loss": 0.6447, + "mean_token_accuracy": 0.805270791053772, + "num_tokens": 12248556.0, + "step": 321 + }, + { + "epoch": 0.04096170970614425, + "ewc_loss": 0.0020904541015625, + "ewc_loss_parallel": 2.086162567138672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.973859786987305, + "learning_rate": 1.360746078846969e-07, + "loss": 0.6336, + "mean_token_accuracy": 0.8065787553787231, + "num_tokens": 12285664.0, + "step": 322 + }, + { + "epoch": 0.04108891998473477, + "ewc_loss": 0.0021209716796875, + "ewc_loss_parallel": 2.115964889526367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.91385269165039, + "learning_rate": 1.3649851632047476e-07, + "loss": 0.6869, + "mean_token_accuracy": 0.792407751083374, + "num_tokens": 12326964.0, + "step": 323 + }, + { + "epoch": 0.041216130263325275, + "ewc_loss": 0.0021820068359375, + "ewc_loss_parallel": 2.175569534301758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.667752742767334, + "learning_rate": 1.3692242475625266e-07, + "loss": 0.594, + "mean_token_accuracy": 0.8215304613113403, + "num_tokens": 12366541.0, + "step": 324 + }, + { + "epoch": 0.04134334054191579, + "ewc_loss": 0.0020751953125, + "ewc_loss_parallel": 2.0712614059448242e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.441765785217285, + "learning_rate": 1.373463331920305e-07, + "loss": 0.6566, + "mean_token_accuracy": 0.8071485757827759, + "num_tokens": 12405664.0, + "step": 325 + }, + { + "epoch": 0.0414705508205063, + "ewc_loss": 0.0021514892578125, + "ewc_loss_parallel": 2.1457672119140625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.998757362365723, + "learning_rate": 1.377702416278084e-07, + "loss": 0.7219, + "mean_token_accuracy": 0.783135175704956, + "num_tokens": 12445039.0, + "step": 326 + }, + { + "epoch": 0.041597761099096804, + "ewc_loss": 0.0021820068359375, + "ewc_loss_parallel": 2.175569534301758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.528707504272461, + "learning_rate": 1.3819415006358625e-07, + "loss": 0.5912, + "mean_token_accuracy": 0.8200029134750366, + "num_tokens": 12485481.0, + "step": 327 + }, + { + "epoch": 0.04172497137768732, + "ewc_loss": 0.0021514892578125, + "ewc_loss_parallel": 2.1457672119140625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.20357608795166, + "learning_rate": 1.3861805849936415e-07, + "loss": 0.6171, + "mean_token_accuracy": 0.8156691789627075, + "num_tokens": 12530272.0, + "step": 328 + }, + { + "epoch": 0.041852181656277826, + "ewc_loss": 0.002105712890625, + "ewc_loss_parallel": 2.1010637283325195e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.094721794128418, + "learning_rate": 1.39041966935142e-07, + "loss": 0.6635, + "mean_token_accuracy": 0.7985919713973999, + "num_tokens": 12565854.0, + "step": 329 + }, + { + "epoch": 0.04197939193486834, + "ewc_loss": 0.0021209716796875, + "ewc_loss_parallel": 2.115964889526367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.294790267944336, + "learning_rate": 1.394658753709199e-07, + "loss": 0.6485, + "mean_token_accuracy": 0.8044062852859497, + "num_tokens": 12607313.0, + "step": 330 + }, + { + "epoch": 0.04210660221345885, + "ewc_loss": 0.0021514892578125, + "ewc_loss_parallel": 2.1457672119140625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.438600540161133, + "learning_rate": 1.3988978380669774e-07, + "loss": 0.6864, + "mean_token_accuracy": 0.7931016683578491, + "num_tokens": 12642057.0, + "step": 331 + }, + { + "epoch": 0.042233812492049355, + "ewc_loss": 0.002197265625, + "ewc_loss_parallel": 2.1904706954956055e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.024364471435547, + "learning_rate": 1.403136922424756e-07, + "loss": 0.6767, + "mean_token_accuracy": 0.7954355478286743, + "num_tokens": 12684118.0, + "step": 332 + }, + { + "epoch": 0.04236102277063987, + "ewc_loss": 0.0021514892578125, + "ewc_loss_parallel": 2.1457672119140625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.072795867919922, + "learning_rate": 1.4073760067825348e-07, + "loss": 0.6421, + "mean_token_accuracy": 0.8046202659606934, + "num_tokens": 12721945.0, + "step": 333 + }, + { + "epoch": 0.04248823304923038, + "ewc_loss": 0.002197265625, + "ewc_loss_parallel": 2.1904706954956055e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.51892375946045, + "learning_rate": 1.4116150911403136e-07, + "loss": 0.6551, + "mean_token_accuracy": 0.8011524677276611, + "num_tokens": 12764175.0, + "step": 334 + }, + { + "epoch": 0.04261544332782089, + "ewc_loss": 0.0021820068359375, + "ewc_loss_parallel": 2.175569534301758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.82372760772705, + "learning_rate": 1.4158541754980923e-07, + "loss": 0.6074, + "mean_token_accuracy": 0.8170691132545471, + "num_tokens": 12801600.0, + "step": 335 + }, + { + "epoch": 0.0427426536064114, + "ewc_loss": 0.0022125244140625, + "ewc_loss_parallel": 2.205371856689453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.541915893554688, + "learning_rate": 1.420093259855871e-07, + "loss": 0.6378, + "mean_token_accuracy": 0.8118122816085815, + "num_tokens": 12843584.0, + "step": 336 + }, + { + "epoch": 0.042869863885001906, + "ewc_loss": 0.0022125244140625, + "ewc_loss_parallel": 2.205371856689453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.497153282165527, + "learning_rate": 1.4243323442136497e-07, + "loss": 0.6529, + "mean_token_accuracy": 0.802107572555542, + "num_tokens": 12878105.0, + "step": 337 + }, + { + "epoch": 0.04299707416359242, + "ewc_loss": 0.002166748046875, + "ewc_loss_parallel": 2.16066837310791e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.9197187423706055, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.661, + "mean_token_accuracy": 0.7999072670936584, + "num_tokens": 12911712.0, + "step": 338 + }, + { + "epoch": 0.04312428444218293, + "ewc_loss": 0.0021514892578125, + "ewc_loss_parallel": 2.1457672119140625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.923541069030762, + "learning_rate": 1.4328105129292072e-07, + "loss": 0.6408, + "mean_token_accuracy": 0.8069993257522583, + "num_tokens": 12951303.0, + "step": 339 + }, + { + "epoch": 0.043251494720773435, + "ewc_loss": 0.002227783203125, + "ewc_loss_parallel": 2.2202730178833008e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.97110652923584, + "learning_rate": 1.437049597286986e-07, + "loss": 0.6279, + "mean_token_accuracy": 0.809525728225708, + "num_tokens": 12990544.0, + "step": 340 + }, + { + "epoch": 0.04337870499936395, + "ewc_loss": 0.0022125244140625, + "ewc_loss_parallel": 2.205371856689453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.015100479125977, + "learning_rate": 1.4412886816447646e-07, + "loss": 0.6877, + "mean_token_accuracy": 0.791803240776062, + "num_tokens": 13023766.0, + "step": 341 + }, + { + "epoch": 0.04350591527795446, + "ewc_loss": 0.0022125244140625, + "ewc_loss_parallel": 2.205371856689453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.41790771484375, + "learning_rate": 1.4455277660025434e-07, + "loss": 0.6022, + "mean_token_accuracy": 0.8166999816894531, + "num_tokens": 13056337.0, + "step": 342 + }, + { + "epoch": 0.04363312555654497, + "ewc_loss": 0.0022430419921875, + "ewc_loss_parallel": 2.250075340270996e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.012749671936035, + "learning_rate": 1.449766850360322e-07, + "loss": 0.6586, + "mean_token_accuracy": 0.8016791939735413, + "num_tokens": 13097345.0, + "step": 343 + }, + { + "epoch": 0.04376033583513548, + "ewc_loss": 0.0022125244140625, + "ewc_loss_parallel": 2.205371856689453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.953073501586914, + "learning_rate": 1.4540059347181008e-07, + "loss": 0.6449, + "mean_token_accuracy": 0.8036813735961914, + "num_tokens": 13127332.0, + "step": 344 + }, + { + "epoch": 0.043887546113725986, + "ewc_loss": 0.002227783203125, + "ewc_loss_parallel": 2.2202730178833008e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 7.75667667388916, + "learning_rate": 1.4582450190758795e-07, + "loss": 0.6732, + "mean_token_accuracy": 0.7955295443534851, + "num_tokens": 13170760.0, + "step": 345 + }, + { + "epoch": 0.0440147563923165, + "ewc_loss": 0.002166748046875, + "ewc_loss_parallel": 2.16066837310791e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.60373592376709, + "learning_rate": 1.4624841034336583e-07, + "loss": 0.6433, + "mean_token_accuracy": 0.806284487247467, + "num_tokens": 13207061.0, + "step": 346 + }, + { + "epoch": 0.04414196667090701, + "ewc_loss": 0.002227783203125, + "ewc_loss_parallel": 2.2351741790771484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.991768836975098, + "learning_rate": 1.466723187791437e-07, + "loss": 0.6106, + "mean_token_accuracy": 0.8150819540023804, + "num_tokens": 13247903.0, + "step": 347 + }, + { + "epoch": 0.04426917694949752, + "ewc_loss": 0.002288818359375, + "ewc_loss_parallel": 2.294778823852539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.315632820129395, + "learning_rate": 1.4709622721492157e-07, + "loss": 0.6209, + "mean_token_accuracy": 0.8074144124984741, + "num_tokens": 13287193.0, + "step": 348 + }, + { + "epoch": 0.04439638722808803, + "ewc_loss": 0.0022430419921875, + "ewc_loss_parallel": 2.250075340270996e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.47659969329834, + "learning_rate": 1.4752013565069942e-07, + "loss": 0.6253, + "mean_token_accuracy": 0.8095780611038208, + "num_tokens": 13323112.0, + "step": 349 + }, + { + "epoch": 0.04452359750667854, + "ewc_loss": 0.0022735595703125, + "ewc_loss_parallel": 2.2798776626586914e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.719677925109863, + "learning_rate": 1.4794404408647732e-07, + "loss": 0.6212, + "mean_token_accuracy": 0.8123430013656616, + "num_tokens": 13359653.0, + "step": 350 + }, + { + "epoch": 0.04465080778526905, + "ewc_loss": 0.0023040771484375, + "ewc_loss_parallel": 2.3096799850463867e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.56421947479248, + "learning_rate": 1.4836795252225516e-07, + "loss": 0.5925, + "mean_token_accuracy": 0.8176425695419312, + "num_tokens": 13400571.0, + "step": 351 + }, + { + "epoch": 0.04477801806385956, + "ewc_loss": 0.002288818359375, + "ewc_loss_parallel": 2.294778823852539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.925312995910645, + "learning_rate": 1.4879186095803306e-07, + "loss": 0.5703, + "mean_token_accuracy": 0.8206771612167358, + "num_tokens": 13437656.0, + "step": 352 + }, + { + "epoch": 0.04490522834245007, + "ewc_loss": 0.0023040771484375, + "ewc_loss_parallel": 2.3096799850463867e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.373480796813965, + "learning_rate": 1.492157693938109e-07, + "loss": 0.6293, + "mean_token_accuracy": 0.8074237108230591, + "num_tokens": 13474688.0, + "step": 353 + }, + { + "epoch": 0.04503243862104058, + "ewc_loss": 0.002288818359375, + "ewc_loss_parallel": 2.294778823852539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.035463333129883, + "learning_rate": 1.496396778295888e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.8260382413864136, + "num_tokens": 13518256.0, + "step": 354 + }, + { + "epoch": 0.04515964889963109, + "ewc_loss": 0.0022735595703125, + "ewc_loss_parallel": 2.2798776626586914e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.40050220489502, + "learning_rate": 1.5006358626536665e-07, + "loss": 0.6534, + "mean_token_accuracy": 0.803668737411499, + "num_tokens": 13554315.0, + "step": 355 + }, + { + "epoch": 0.0452868591782216, + "ewc_loss": 0.0023193359375, + "ewc_loss_parallel": 2.3245811462402344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.166177749633789, + "learning_rate": 1.5048749470114455e-07, + "loss": 0.6751, + "mean_token_accuracy": 0.796178936958313, + "num_tokens": 13594388.0, + "step": 356 + }, + { + "epoch": 0.04541406945681211, + "ewc_loss": 0.0023193359375, + "ewc_loss_parallel": 2.3245811462402344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.298208236694336, + "learning_rate": 1.509114031369224e-07, + "loss": 0.6002, + "mean_token_accuracy": 0.8149843215942383, + "num_tokens": 13633704.0, + "step": 357 + }, + { + "epoch": 0.04554127973540262, + "ewc_loss": 0.0023193359375, + "ewc_loss_parallel": 2.3245811462402344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.432403564453125, + "learning_rate": 1.513353115727003e-07, + "loss": 0.6513, + "mean_token_accuracy": 0.7979471683502197, + "num_tokens": 13671183.0, + "step": 358 + }, + { + "epoch": 0.04566849001399313, + "ewc_loss": 0.0023345947265625, + "ewc_loss_parallel": 2.339482307434082e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.301637649536133, + "learning_rate": 1.5175922000847814e-07, + "loss": 0.605, + "mean_token_accuracy": 0.8149482011795044, + "num_tokens": 13709640.0, + "step": 359 + }, + { + "epoch": 0.04579570029258364, + "ewc_loss": 0.002349853515625, + "ewc_loss_parallel": 2.3543834686279297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.761834144592285, + "learning_rate": 1.5218312844425604e-07, + "loss": 0.6272, + "mean_token_accuracy": 0.8020066022872925, + "num_tokens": 13751114.0, + "step": 360 + }, + { + "epoch": 0.045922910571174154, + "ewc_loss": 0.0023651123046875, + "ewc_loss_parallel": 2.3692846298217773e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.495912551879883, + "learning_rate": 1.526070368800339e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.8224564790725708, + "num_tokens": 13786448.0, + "step": 361 + }, + { + "epoch": 0.04605012084976466, + "ewc_loss": 0.002349853515625, + "ewc_loss_parallel": 2.3543834686279297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.621554374694824, + "learning_rate": 1.530309453158118e-07, + "loss": 0.5813, + "mean_token_accuracy": 0.8252140283584595, + "num_tokens": 13827663.0, + "step": 362 + }, + { + "epoch": 0.04617733112835517, + "ewc_loss": 0.002349853515625, + "ewc_loss_parallel": 2.3543834686279297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.891202926635742, + "learning_rate": 1.5345485375158963e-07, + "loss": 0.66, + "mean_token_accuracy": 0.800460696220398, + "num_tokens": 13866078.0, + "step": 363 + }, + { + "epoch": 0.04630454140694568, + "ewc_loss": 0.0023651123046875, + "ewc_loss_parallel": 2.3692846298217773e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.430252075195312, + "learning_rate": 1.5387876218736753e-07, + "loss": 0.5374, + "mean_token_accuracy": 0.8325359225273132, + "num_tokens": 13909767.0, + "step": 364 + }, + { + "epoch": 0.04643175168553619, + "ewc_loss": 0.002349853515625, + "ewc_loss_parallel": 2.3543834686279297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.625835418701172, + "learning_rate": 1.5430267062314538e-07, + "loss": 0.6462, + "mean_token_accuracy": 0.8058871626853943, + "num_tokens": 13948937.0, + "step": 365 + }, + { + "epoch": 0.0465589619641267, + "ewc_loss": 0.0023651123046875, + "ewc_loss_parallel": 2.3692846298217773e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.44101619720459, + "learning_rate": 1.5472657905892328e-07, + "loss": 0.6573, + "mean_token_accuracy": 0.7986865043640137, + "num_tokens": 13984039.0, + "step": 366 + }, + { + "epoch": 0.04668617224271721, + "ewc_loss": 0.0023956298828125, + "ewc_loss_parallel": 2.3990869522094727e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.38039493560791, + "learning_rate": 1.5515048749470113e-07, + "loss": 0.6406, + "mean_token_accuracy": 0.8030736446380615, + "num_tokens": 14018162.0, + "step": 367 + }, + { + "epoch": 0.04681338252130772, + "ewc_loss": 0.0023956298828125, + "ewc_loss_parallel": 2.3990869522094727e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.925853729248047, + "learning_rate": 1.55574395930479e-07, + "loss": 0.64, + "mean_token_accuracy": 0.8047165274620056, + "num_tokens": 14056493.0, + "step": 368 + }, + { + "epoch": 0.046940592799898234, + "ewc_loss": 0.0023651123046875, + "ewc_loss_parallel": 2.3692846298217773e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.520870208740234, + "learning_rate": 1.5599830436625687e-07, + "loss": 0.6334, + "mean_token_accuracy": 0.807560384273529, + "num_tokens": 14097530.0, + "step": 369 + }, + { + "epoch": 0.04706780307848874, + "ewc_loss": 0.0023651123046875, + "ewc_loss_parallel": 2.3692846298217773e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.792484283447266, + "learning_rate": 1.5642221280203474e-07, + "loss": 0.6521, + "mean_token_accuracy": 0.8061304092407227, + "num_tokens": 14136240.0, + "step": 370 + }, + { + "epoch": 0.04719501335707925, + "ewc_loss": 0.0023956298828125, + "ewc_loss_parallel": 2.3990869522094727e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.331767082214355, + "learning_rate": 1.5684612123781262e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8262265920639038, + "num_tokens": 14171010.0, + "step": 371 + }, + { + "epoch": 0.04732222363566976, + "ewc_loss": 0.002410888671875, + "ewc_loss_parallel": 2.4139881134033203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.39177417755127, + "learning_rate": 1.572700296735905e-07, + "loss": 0.6324, + "mean_token_accuracy": 0.8121798634529114, + "num_tokens": 14206985.0, + "step": 372 + }, + { + "epoch": 0.04744943391426027, + "ewc_loss": 0.0023651123046875, + "ewc_loss_parallel": 2.3692846298217773e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.988455772399902, + "learning_rate": 1.576939381093684e-07, + "loss": 0.6322, + "mean_token_accuracy": 0.8041210174560547, + "num_tokens": 14236728.0, + "step": 373 + }, + { + "epoch": 0.047576644192850785, + "ewc_loss": 0.0024261474609375, + "ewc_loss_parallel": 2.428889274597168e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.198880195617676, + "learning_rate": 1.5811784654514623e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.8274049758911133, + "num_tokens": 14270758.0, + "step": 374 + }, + { + "epoch": 0.04770385447144129, + "ewc_loss": 0.0024566650390625, + "ewc_loss_parallel": 2.4586915969848633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.830583572387695, + "learning_rate": 1.5854175498092413e-07, + "loss": 0.677, + "mean_token_accuracy": 0.7917110919952393, + "num_tokens": 14315002.0, + "step": 375 + }, + { + "epoch": 0.0478310647500318, + "ewc_loss": 0.0024261474609375, + "ewc_loss_parallel": 2.428889274597168e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.060370445251465, + "learning_rate": 1.5896566341670198e-07, + "loss": 0.647, + "mean_token_accuracy": 0.799508273601532, + "num_tokens": 14353385.0, + "step": 376 + }, + { + "epoch": 0.047958275028622314, + "ewc_loss": 0.0024566650390625, + "ewc_loss_parallel": 2.4586915969848633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.152095794677734, + "learning_rate": 1.5938957185247988e-07, + "loss": 0.6792, + "mean_token_accuracy": 0.7892894148826599, + "num_tokens": 14391702.0, + "step": 377 + }, + { + "epoch": 0.04808548530721282, + "ewc_loss": 0.002471923828125, + "ewc_loss_parallel": 2.473592758178711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.93427562713623, + "learning_rate": 1.5981348028825772e-07, + "loss": 0.6471, + "mean_token_accuracy": 0.8031771183013916, + "num_tokens": 14427876.0, + "step": 378 + }, + { + "epoch": 0.048212695585803336, + "ewc_loss": 0.00244140625, + "ewc_loss_parallel": 2.4437904357910156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.90151309967041, + "learning_rate": 1.6023738872403562e-07, + "loss": 0.7354, + "mean_token_accuracy": 0.7751237154006958, + "num_tokens": 14460349.0, + "step": 379 + }, + { + "epoch": 0.048339905864393844, + "ewc_loss": 0.0024566650390625, + "ewc_loss_parallel": 2.4586915969848633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.007549285888672, + "learning_rate": 1.6066129715981347e-07, + "loss": 0.5841, + "mean_token_accuracy": 0.8230306506156921, + "num_tokens": 14497763.0, + "step": 380 + }, + { + "epoch": 0.04846711614298435, + "ewc_loss": 0.0024566650390625, + "ewc_loss_parallel": 2.4586915969848633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.13980770111084, + "learning_rate": 1.6108520559559137e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.8208991289138794, + "num_tokens": 14537172.0, + "step": 381 + }, + { + "epoch": 0.048594326421574865, + "ewc_loss": 0.0024261474609375, + "ewc_loss_parallel": 2.428889274597168e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.2940034866333, + "learning_rate": 1.6150911403136921e-07, + "loss": 0.665, + "mean_token_accuracy": 0.802221417427063, + "num_tokens": 14565515.0, + "step": 382 + }, + { + "epoch": 0.04872153670016537, + "ewc_loss": 0.00250244140625, + "ewc_loss_parallel": 2.5033950805664062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.345035552978516, + "learning_rate": 1.619330224671471e-07, + "loss": 0.6548, + "mean_token_accuracy": 0.8015453219413757, + "num_tokens": 14608104.0, + "step": 383 + }, + { + "epoch": 0.04884874697875588, + "ewc_loss": 0.002471923828125, + "ewc_loss_parallel": 2.473592758178711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.710956573486328, + "learning_rate": 1.6235693090292496e-07, + "loss": 0.6613, + "mean_token_accuracy": 0.8028895854949951, + "num_tokens": 14645328.0, + "step": 384 + }, + { + "epoch": 0.048975957257346395, + "ewc_loss": 0.0024871826171875, + "ewc_loss_parallel": 2.4884939193725586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.229822158813477, + "learning_rate": 1.6278083933870286e-07, + "loss": 0.6985, + "mean_token_accuracy": 0.7841172218322754, + "num_tokens": 14678792.0, + "step": 385 + }, + { + "epoch": 0.0491031675359369, + "ewc_loss": 0.00250244140625, + "ewc_loss_parallel": 2.5033950805664062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.548136711120605, + "learning_rate": 1.632047477744807e-07, + "loss": 0.6206, + "mean_token_accuracy": 0.8114973306655884, + "num_tokens": 14715095.0, + "step": 386 + }, + { + "epoch": 0.049230377814527417, + "ewc_loss": 0.0024871826171875, + "ewc_loss_parallel": 2.4884939193725586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.844243049621582, + "learning_rate": 1.6362865621025858e-07, + "loss": 0.6396, + "mean_token_accuracy": 0.8057734966278076, + "num_tokens": 14753641.0, + "step": 387 + }, + { + "epoch": 0.049357588093117924, + "ewc_loss": 0.00250244140625, + "ewc_loss_parallel": 2.5033950805664062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.455232620239258, + "learning_rate": 1.6405256464603645e-07, + "loss": 0.6104, + "mean_token_accuracy": 0.8114067316055298, + "num_tokens": 14788816.0, + "step": 388 + }, + { + "epoch": 0.04948479837170843, + "ewc_loss": 0.0024871826171875, + "ewc_loss_parallel": 2.4884939193725586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.059876441955566, + "learning_rate": 1.6447647308181432e-07, + "loss": 0.655, + "mean_token_accuracy": 0.7998487949371338, + "num_tokens": 14821607.0, + "step": 389 + }, + { + "epoch": 0.049612008650298946, + "ewc_loss": 0.0025177001953125, + "ewc_loss_parallel": 2.518296241760254e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.554154396057129, + "learning_rate": 1.649003815175922e-07, + "loss": 0.6722, + "mean_token_accuracy": 0.7975452542304993, + "num_tokens": 14861298.0, + "step": 390 + }, + { + "epoch": 0.04973921892888945, + "ewc_loss": 0.0025482177734375, + "ewc_loss_parallel": 2.5480985641479492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.631272315979004, + "learning_rate": 1.6532428995337007e-07, + "loss": 0.5881, + "mean_token_accuracy": 0.818036675453186, + "num_tokens": 14899951.0, + "step": 391 + }, + { + "epoch": 0.04986642920747997, + "ewc_loss": 0.0025177001953125, + "ewc_loss_parallel": 2.518296241760254e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.170821189880371, + "learning_rate": 1.6574819838914794e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8276220560073853, + "num_tokens": 14937169.0, + "step": 392 + }, + { + "epoch": 0.049993639486070475, + "ewc_loss": 0.0024871826171875, + "ewc_loss_parallel": 2.4884939193725586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.695625305175781, + "learning_rate": 1.661721068249258e-07, + "loss": 0.6219, + "mean_token_accuracy": 0.8103762865066528, + "num_tokens": 14975549.0, + "step": 393 + }, + { + "epoch": 0.05012084976466098, + "ewc_loss": 0.0025177001953125, + "ewc_loss_parallel": 2.518296241760254e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.536992073059082, + "learning_rate": 1.6659601526070368e-07, + "loss": 0.6494, + "mean_token_accuracy": 0.8035546541213989, + "num_tokens": 15016630.0, + "step": 394 + }, + { + "epoch": 0.0502480600432515, + "ewc_loss": 0.002532958984375, + "ewc_loss_parallel": 2.5331974029541016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.816391944885254, + "learning_rate": 1.6701992369648156e-07, + "loss": 0.6305, + "mean_token_accuracy": 0.8062180280685425, + "num_tokens": 15057822.0, + "step": 395 + }, + { + "epoch": 0.050375270321842004, + "ewc_loss": 0.0025482177734375, + "ewc_loss_parallel": 2.5480985641479492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.599835395812988, + "learning_rate": 1.6744383213225943e-07, + "loss": 0.6042, + "mean_token_accuracy": 0.8097303509712219, + "num_tokens": 15094385.0, + "step": 396 + }, + { + "epoch": 0.05050248060043251, + "ewc_loss": 0.0025787353515625, + "ewc_loss_parallel": 2.5779008865356445e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.5106782913208, + "learning_rate": 1.678677405680373e-07, + "loss": 0.5631, + "mean_token_accuracy": 0.8234203457832336, + "num_tokens": 15130969.0, + "step": 397 + }, + { + "epoch": 0.050629690879023026, + "ewc_loss": 0.0025177001953125, + "ewc_loss_parallel": 2.518296241760254e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.673959732055664, + "learning_rate": 1.6829164900381518e-07, + "loss": 0.5838, + "mean_token_accuracy": 0.8166221380233765, + "num_tokens": 15166586.0, + "step": 398 + }, + { + "epoch": 0.05075690115761353, + "ewc_loss": 0.002532958984375, + "ewc_loss_parallel": 2.5331974029541016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.91696548461914, + "learning_rate": 1.6871555743959305e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8193067312240601, + "num_tokens": 15209603.0, + "step": 399 + }, + { + "epoch": 0.05088411143620405, + "ewc_loss": 0.0025634765625, + "ewc_loss_parallel": 2.562999725341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.791338920593262, + "learning_rate": 1.6913946587537092e-07, + "loss": 0.5917, + "mean_token_accuracy": 0.8168106079101562, + "num_tokens": 15251542.0, + "step": 400 + }, + { + "epoch": 0.051011321714794555, + "ewc_loss": 0.0025634765625, + "ewc_loss_parallel": 2.562999725341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.988388061523438, + "learning_rate": 1.695633743111488e-07, + "loss": 0.6065, + "mean_token_accuracy": 0.8154354691505432, + "num_tokens": 15288940.0, + "step": 401 + }, + { + "epoch": 0.05113853199338506, + "ewc_loss": 0.0025634765625, + "ewc_loss_parallel": 2.562999725341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.639955520629883, + "learning_rate": 1.6998728274692667e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8260315656661987, + "num_tokens": 15321965.0, + "step": 402 + }, + { + "epoch": 0.05126574227197558, + "ewc_loss": 0.0025634765625, + "ewc_loss_parallel": 2.562999725341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.722309112548828, + "learning_rate": 1.7041119118270454e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8214489221572876, + "num_tokens": 15362895.0, + "step": 403 + }, + { + "epoch": 0.051392952550566084, + "ewc_loss": 0.0025787353515625, + "ewc_loss_parallel": 2.5779008865356445e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.347762107849121, + "learning_rate": 1.7083509961848238e-07, + "loss": 0.6424, + "mean_token_accuracy": 0.8017255067825317, + "num_tokens": 15396675.0, + "step": 404 + }, + { + "epoch": 0.0515201628291566, + "ewc_loss": 0.002593994140625, + "ewc_loss_parallel": 2.592802047729492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.647814750671387, + "learning_rate": 1.7125900805426028e-07, + "loss": 0.662, + "mean_token_accuracy": 0.797907829284668, + "num_tokens": 15443070.0, + "step": 405 + }, + { + "epoch": 0.051647373107747106, + "ewc_loss": 0.0025634765625, + "ewc_loss_parallel": 2.562999725341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.341715812683105, + "learning_rate": 1.7168291649003813e-07, + "loss": 0.5895, + "mean_token_accuracy": 0.8168545365333557, + "num_tokens": 15476747.0, + "step": 406 + }, + { + "epoch": 0.051774583386337614, + "ewc_loss": 0.0025482177734375, + "ewc_loss_parallel": 2.5480985641479492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.807439804077148, + "learning_rate": 1.7210682492581603e-07, + "loss": 0.614, + "mean_token_accuracy": 0.8124684691429138, + "num_tokens": 15514866.0, + "step": 407 + }, + { + "epoch": 0.05190179366492813, + "ewc_loss": 0.002593994140625, + "ewc_loss_parallel": 2.592802047729492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.878665924072266, + "learning_rate": 1.7253073336159387e-07, + "loss": 0.5902, + "mean_token_accuracy": 0.8164839744567871, + "num_tokens": 15554245.0, + "step": 408 + }, + { + "epoch": 0.052029003943518635, + "ewc_loss": 0.00262451171875, + "ewc_loss_parallel": 2.6226043701171875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.638091087341309, + "learning_rate": 1.7295464179737177e-07, + "loss": 0.6183, + "mean_token_accuracy": 0.8116266131401062, + "num_tokens": 15593958.0, + "step": 409 + }, + { + "epoch": 0.05215621422210915, + "ewc_loss": 0.002593994140625, + "ewc_loss_parallel": 2.592802047729492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.384969711303711, + "learning_rate": 1.7337855023314962e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.8248791694641113, + "num_tokens": 15632567.0, + "step": 410 + }, + { + "epoch": 0.05228342450069966, + "ewc_loss": 0.0025787353515625, + "ewc_loss_parallel": 2.5779008865356445e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.07961654663086, + "learning_rate": 1.7380245866892752e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8213843107223511, + "num_tokens": 15668255.0, + "step": 411 + }, + { + "epoch": 0.052410634779290165, + "ewc_loss": 0.002655029296875, + "ewc_loss_parallel": 2.652406692504883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.27226734161377, + "learning_rate": 1.7422636710470536e-07, + "loss": 0.6284, + "mean_token_accuracy": 0.8057376742362976, + "num_tokens": 15705435.0, + "step": 412 + }, + { + "epoch": 0.05253784505788068, + "ewc_loss": 0.0026702880859375, + "ewc_loss_parallel": 2.6673078536987305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.861532211303711, + "learning_rate": 1.7465027554048326e-07, + "loss": 0.6481, + "mean_token_accuracy": 0.80116868019104, + "num_tokens": 15744657.0, + "step": 413 + }, + { + "epoch": 0.05266505533647119, + "ewc_loss": 0.0026092529296875, + "ewc_loss_parallel": 2.60770320892334e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.987380981445312, + "learning_rate": 1.750741839762611e-07, + "loss": 0.5898, + "mean_token_accuracy": 0.8194476366043091, + "num_tokens": 15777374.0, + "step": 414 + }, + { + "epoch": 0.052792265615061694, + "ewc_loss": 0.00262451171875, + "ewc_loss_parallel": 2.6226043701171875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.840057373046875, + "learning_rate": 1.75498092412039e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.8265946507453918, + "num_tokens": 15814047.0, + "step": 415 + }, + { + "epoch": 0.05291947589365221, + "ewc_loss": 0.0026397705078125, + "ewc_loss_parallel": 2.637505531311035e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.28353214263916, + "learning_rate": 1.7592200084781686e-07, + "loss": 0.6458, + "mean_token_accuracy": 0.8001805543899536, + "num_tokens": 15849110.0, + "step": 416 + }, + { + "epoch": 0.053046686172242716, + "ewc_loss": 0.0026092529296875, + "ewc_loss_parallel": 2.60770320892334e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.648272514343262, + "learning_rate": 1.7634590928359475e-07, + "loss": 0.6146, + "mean_token_accuracy": 0.8133435845375061, + "num_tokens": 15888910.0, + "step": 417 + }, + { + "epoch": 0.05317389645083323, + "ewc_loss": 0.002685546875, + "ewc_loss_parallel": 2.682209014892578e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.806100845336914, + "learning_rate": 1.767698177193726e-07, + "loss": 0.6017, + "mean_token_accuracy": 0.8146113753318787, + "num_tokens": 15925604.0, + "step": 418 + }, + { + "epoch": 0.05330110672942374, + "ewc_loss": 0.002685546875, + "ewc_loss_parallel": 2.682209014892578e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.419374465942383, + "learning_rate": 1.771937261551505e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.828262448310852, + "num_tokens": 15961401.0, + "step": 419 + }, + { + "epoch": 0.053428317008014245, + "ewc_loss": 0.0026397705078125, + "ewc_loss_parallel": 2.637505531311035e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.724082946777344, + "learning_rate": 1.7761763459092835e-07, + "loss": 0.6057, + "mean_token_accuracy": 0.8140764832496643, + "num_tokens": 16001029.0, + "step": 420 + }, + { + "epoch": 0.05355552728660476, + "ewc_loss": 0.0027008056640625, + "ewc_loss_parallel": 2.6971101760864258e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.019049644470215, + "learning_rate": 1.7804154302670624e-07, + "loss": 0.5872, + "mean_token_accuracy": 0.8193696141242981, + "num_tokens": 16036014.0, + "step": 421 + }, + { + "epoch": 0.05368273756519527, + "ewc_loss": 0.0027313232421875, + "ewc_loss_parallel": 2.726912498474121e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.735569953918457, + "learning_rate": 1.784654514624841e-07, + "loss": 0.5688, + "mean_token_accuracy": 0.8212974667549133, + "num_tokens": 16075373.0, + "step": 422 + }, + { + "epoch": 0.05380994784378578, + "ewc_loss": 0.0027313232421875, + "ewc_loss_parallel": 2.726912498474121e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.084811210632324, + "learning_rate": 1.7888935989826196e-07, + "loss": 0.6109, + "mean_token_accuracy": 0.807906985282898, + "num_tokens": 16111352.0, + "step": 423 + }, + { + "epoch": 0.05393715812237629, + "ewc_loss": 0.0027618408203125, + "ewc_loss_parallel": 2.7567148208618164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.825933456420898, + "learning_rate": 1.7931326833403984e-07, + "loss": 0.6273, + "mean_token_accuracy": 0.8058838248252869, + "num_tokens": 16148826.0, + "step": 424 + }, + { + "epoch": 0.054064368400966796, + "ewc_loss": 0.0027008056640625, + "ewc_loss_parallel": 2.6971101760864258e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.268967628479004, + "learning_rate": 1.797371767698177e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8362812399864197, + "num_tokens": 16193491.0, + "step": 425 + }, + { + "epoch": 0.05419157867955731, + "ewc_loss": 0.0026702880859375, + "ewc_loss_parallel": 2.6673078536987305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.863520622253418, + "learning_rate": 1.8016108520559558e-07, + "loss": 0.5842, + "mean_token_accuracy": 0.8209033608436584, + "num_tokens": 16235595.0, + "step": 426 + }, + { + "epoch": 0.05431878895814782, + "ewc_loss": 0.002777099609375, + "ewc_loss_parallel": 2.771615982055664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.163789749145508, + "learning_rate": 1.8058499364137345e-07, + "loss": 0.6393, + "mean_token_accuracy": 0.8085329532623291, + "num_tokens": 16274105.0, + "step": 427 + }, + { + "epoch": 0.054445999236738325, + "ewc_loss": 0.0028533935546875, + "ewc_loss_parallel": 2.8461217880249023e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.81577205657959, + "learning_rate": 1.8100890207715133e-07, + "loss": 0.6823, + "mean_token_accuracy": 0.7874737977981567, + "num_tokens": 16315886.0, + "step": 428 + }, + { + "epoch": 0.05457320951532884, + "ewc_loss": 0.002777099609375, + "ewc_loss_parallel": 2.771615982055664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.916504859924316, + "learning_rate": 1.814328105129292e-07, + "loss": 0.6238, + "mean_token_accuracy": 0.806714653968811, + "num_tokens": 16353021.0, + "step": 429 + }, + { + "epoch": 0.05470041979391935, + "ewc_loss": 0.002777099609375, + "ewc_loss_parallel": 2.771615982055664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.389135360717773, + "learning_rate": 1.8185671894870707e-07, + "loss": 0.5977, + "mean_token_accuracy": 0.8124553561210632, + "num_tokens": 16380887.0, + "step": 430 + }, + { + "epoch": 0.05482763007250986, + "ewc_loss": 0.00286865234375, + "ewc_loss_parallel": 2.86102294921875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.66264533996582, + "learning_rate": 1.8228062738448494e-07, + "loss": 0.6541, + "mean_token_accuracy": 0.8004610538482666, + "num_tokens": 16421066.0, + "step": 431 + }, + { + "epoch": 0.05495484035110037, + "ewc_loss": 0.0027618408203125, + "ewc_loss_parallel": 2.7567148208618164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.01974105834961, + "learning_rate": 1.8270453582026282e-07, + "loss": 0.6233, + "mean_token_accuracy": 0.8077369332313538, + "num_tokens": 16457197.0, + "step": 432 + }, + { + "epoch": 0.055082050629690876, + "ewc_loss": 0.0028076171875, + "ewc_loss_parallel": 2.8014183044433594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.597590446472168, + "learning_rate": 1.831284442560407e-07, + "loss": 0.5765, + "mean_token_accuracy": 0.8207961916923523, + "num_tokens": 16497385.0, + "step": 433 + }, + { + "epoch": 0.05520926090828139, + "ewc_loss": 0.0028076171875, + "ewc_loss_parallel": 2.8014183044433594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.619101524353027, + "learning_rate": 1.8355235269181856e-07, + "loss": 0.5918, + "mean_token_accuracy": 0.8179654479026794, + "num_tokens": 16540099.0, + "step": 434 + }, + { + "epoch": 0.0553364711868719, + "ewc_loss": 0.002838134765625, + "ewc_loss_parallel": 2.8312206268310547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.200772285461426, + "learning_rate": 1.8397626112759643e-07, + "loss": 0.5903, + "mean_token_accuracy": 0.8179854154586792, + "num_tokens": 16572760.0, + "step": 435 + }, + { + "epoch": 0.05546368146546241, + "ewc_loss": 0.0029144287109375, + "ewc_loss_parallel": 2.9206275939941406e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.130887031555176, + "learning_rate": 1.844001695633743e-07, + "loss": 0.5491, + "mean_token_accuracy": 0.8286387920379639, + "num_tokens": 16608549.0, + "step": 436 + }, + { + "epoch": 0.05559089174405292, + "ewc_loss": 0.0028839111328125, + "ewc_loss_parallel": 2.8908252716064453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.15714168548584, + "learning_rate": 1.8482407799915218e-07, + "loss": 0.5882, + "mean_token_accuracy": 0.8192141652107239, + "num_tokens": 16643800.0, + "step": 437 + }, + { + "epoch": 0.05571810202264343, + "ewc_loss": 0.00286865234375, + "ewc_loss_parallel": 2.86102294921875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.145246505737305, + "learning_rate": 1.8524798643493005e-07, + "loss": 0.6601, + "mean_token_accuracy": 0.7952760457992554, + "num_tokens": 16677948.0, + "step": 438 + }, + { + "epoch": 0.05584531230123394, + "ewc_loss": 0.0029144287109375, + "ewc_loss_parallel": 2.9206275939941406e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.767624855041504, + "learning_rate": 1.8567189487070792e-07, + "loss": 0.6316, + "mean_token_accuracy": 0.8075075745582581, + "num_tokens": 16716905.0, + "step": 439 + }, + { + "epoch": 0.05597252257982445, + "ewc_loss": 0.00286865234375, + "ewc_loss_parallel": 2.86102294921875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.881339073181152, + "learning_rate": 1.8609580330648577e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8306908011436462, + "num_tokens": 16756158.0, + "step": 440 + }, + { + "epoch": 0.05609973285841496, + "ewc_loss": 0.00286865234375, + "ewc_loss_parallel": 2.86102294921875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.0349702835083, + "learning_rate": 1.8651971174226367e-07, + "loss": 0.6346, + "mean_token_accuracy": 0.8071919083595276, + "num_tokens": 16794261.0, + "step": 441 + }, + { + "epoch": 0.05622694313700547, + "ewc_loss": 0.0029144287109375, + "ewc_loss_parallel": 2.9206275939941406e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.633126258850098, + "learning_rate": 1.8694362017804152e-07, + "loss": 0.5912, + "mean_token_accuracy": 0.8159180879592896, + "num_tokens": 16834953.0, + "step": 442 + }, + { + "epoch": 0.05635415341559598, + "ewc_loss": 0.0028839111328125, + "ewc_loss_parallel": 2.8908252716064453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.728684425354004, + "learning_rate": 1.8736752861381941e-07, + "loss": 0.55, + "mean_token_accuracy": 0.8305761218070984, + "num_tokens": 16873954.0, + "step": 443 + }, + { + "epoch": 0.05648136369418649, + "ewc_loss": 0.002899169921875, + "ewc_loss_parallel": 2.905726432800293e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.705058097839355, + "learning_rate": 1.8779143704959726e-07, + "loss": 0.5744, + "mean_token_accuracy": 0.81883305311203, + "num_tokens": 16906018.0, + "step": 444 + }, + { + "epoch": 0.056608573972777, + "ewc_loss": 0.0029449462890625, + "ewc_loss_parallel": 2.950429916381836e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.15754222869873, + "learning_rate": 1.8821534548537516e-07, + "loss": 0.6288, + "mean_token_accuracy": 0.807231068611145, + "num_tokens": 16938945.0, + "step": 445 + }, + { + "epoch": 0.05673578425136751, + "ewc_loss": 0.00299072265625, + "ewc_loss_parallel": 2.995133399963379e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.744545936584473, + "learning_rate": 1.88639253921153e-07, + "loss": 0.5851, + "mean_token_accuracy": 0.8215811252593994, + "num_tokens": 16982214.0, + "step": 446 + }, + { + "epoch": 0.05686299452995802, + "ewc_loss": 0.002960205078125, + "ewc_loss_parallel": 2.9653310775756836e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.887062072753906, + "learning_rate": 1.890631623569309e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.8154445290565491, + "num_tokens": 17024367.0, + "step": 447 + }, + { + "epoch": 0.05699020480854853, + "ewc_loss": 0.0029296875, + "ewc_loss_parallel": 2.9355287551879883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.69523811340332, + "learning_rate": 1.8948707079270875e-07, + "loss": 0.6171, + "mean_token_accuracy": 0.8058762550354004, + "num_tokens": 17057451.0, + "step": 448 + }, + { + "epoch": 0.057117415087139044, + "ewc_loss": 0.0030059814453125, + "ewc_loss_parallel": 3.0100345611572266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.432912826538086, + "learning_rate": 1.8991097922848665e-07, + "loss": 0.5679, + "mean_token_accuracy": 0.8230741620063782, + "num_tokens": 17094633.0, + "step": 449 + }, + { + "epoch": 0.05724462536572955, + "ewc_loss": 0.0029754638671875, + "ewc_loss_parallel": 2.9802322387695312e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.378547668457031, + "learning_rate": 1.903348876642645e-07, + "loss": 0.622, + "mean_token_accuracy": 0.8068206310272217, + "num_tokens": 17135892.0, + "step": 450 + }, + { + "epoch": 0.05737183564432006, + "ewc_loss": 0.0029754638671875, + "ewc_loss_parallel": 2.9802322387695312e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.175579071044922, + "learning_rate": 1.907587961000424e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8247605562210083, + "num_tokens": 17173881.0, + "step": 451 + }, + { + "epoch": 0.05749904592291057, + "ewc_loss": 0.0029754638671875, + "ewc_loss_parallel": 2.9802322387695312e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.89188289642334, + "learning_rate": 1.9118270453582024e-07, + "loss": 0.614, + "mean_token_accuracy": 0.8114122152328491, + "num_tokens": 17215668.0, + "step": 452 + }, + { + "epoch": 0.05762625620150108, + "ewc_loss": 0.002960205078125, + "ewc_loss_parallel": 2.9653310775756836e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.861260414123535, + "learning_rate": 1.9160661297159814e-07, + "loss": 0.6386, + "mean_token_accuracy": 0.8051509857177734, + "num_tokens": 17254339.0, + "step": 453 + }, + { + "epoch": 0.057753466480091595, + "ewc_loss": 0.0029754638671875, + "ewc_loss_parallel": 2.9802322387695312e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.862113952636719, + "learning_rate": 1.9203052140737599e-07, + "loss": 0.6217, + "mean_token_accuracy": 0.805698037147522, + "num_tokens": 17288858.0, + "step": 454 + }, + { + "epoch": 0.0578806767586821, + "ewc_loss": 0.00299072265625, + "ewc_loss_parallel": 2.995133399963379e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.971419334411621, + "learning_rate": 1.9245442984315389e-07, + "loss": 0.5982, + "mean_token_accuracy": 0.8178178071975708, + "num_tokens": 17330601.0, + "step": 455 + }, + { + "epoch": 0.05800788703727261, + "ewc_loss": 0.00299072265625, + "ewc_loss_parallel": 2.995133399963379e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.077463150024414, + "learning_rate": 1.9287833827893173e-07, + "loss": 0.6171, + "mean_token_accuracy": 0.8086214065551758, + "num_tokens": 17367797.0, + "step": 456 + }, + { + "epoch": 0.058135097315863124, + "ewc_loss": 0.0030364990234375, + "ewc_loss_parallel": 3.039836883544922e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.885064125061035, + "learning_rate": 1.9330224671470963e-07, + "loss": 0.6048, + "mean_token_accuracy": 0.812856912612915, + "num_tokens": 17403206.0, + "step": 457 + }, + { + "epoch": 0.05826230759445363, + "ewc_loss": 0.003021240234375, + "ewc_loss_parallel": 3.0249357223510742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.821004867553711, + "learning_rate": 1.9372615515048748e-07, + "loss": 0.65, + "mean_token_accuracy": 0.8032327890396118, + "num_tokens": 17437207.0, + "step": 458 + }, + { + "epoch": 0.05838951787304414, + "ewc_loss": 0.0030059814453125, + "ewc_loss_parallel": 3.0100345611572266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.139859199523926, + "learning_rate": 1.9415006358626535e-07, + "loss": 0.5862, + "mean_token_accuracy": 0.8212364912033081, + "num_tokens": 17473335.0, + "step": 459 + }, + { + "epoch": 0.05851672815163465, + "ewc_loss": 0.0030975341796875, + "ewc_loss_parallel": 3.0994415283203125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.305647850036621, + "learning_rate": 1.9457397202204322e-07, + "loss": 0.5757, + "mean_token_accuracy": 0.8180733919143677, + "num_tokens": 17505104.0, + "step": 460 + }, + { + "epoch": 0.05864393843022516, + "ewc_loss": 0.003082275390625, + "ewc_loss_parallel": 3.084540367126465e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.9408597946167, + "learning_rate": 1.949978804578211e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.8271260857582092, + "num_tokens": 17539052.0, + "step": 461 + }, + { + "epoch": 0.058771148708815675, + "ewc_loss": 0.0030364990234375, + "ewc_loss_parallel": 3.039836883544922e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.998085975646973, + "learning_rate": 1.9542178889359897e-07, + "loss": 0.6335, + "mean_token_accuracy": 0.8034695386886597, + "num_tokens": 17578942.0, + "step": 462 + }, + { + "epoch": 0.05889835898740618, + "ewc_loss": 0.003082275390625, + "ewc_loss_parallel": 3.084540367126465e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.980246543884277, + "learning_rate": 1.9584569732937684e-07, + "loss": 0.6124, + "mean_token_accuracy": 0.8140758275985718, + "num_tokens": 17611931.0, + "step": 463 + }, + { + "epoch": 0.05902556926599669, + "ewc_loss": 0.0030975341796875, + "ewc_loss_parallel": 3.0994415283203125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.012784957885742, + "learning_rate": 1.962696057651547e-07, + "loss": 0.5163, + "mean_token_accuracy": 0.8398804068565369, + "num_tokens": 17648764.0, + "step": 464 + }, + { + "epoch": 0.059152779544587204, + "ewc_loss": 0.0030975341796875, + "ewc_loss_parallel": 3.0994415283203125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.528729438781738, + "learning_rate": 1.9669351420093258e-07, + "loss": 0.6276, + "mean_token_accuracy": 0.8063817024230957, + "num_tokens": 17683099.0, + "step": 465 + }, + { + "epoch": 0.05927998982317771, + "ewc_loss": 0.0031890869140625, + "ewc_loss_parallel": 3.1888484954833984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.136617660522461, + "learning_rate": 1.9711742263671046e-07, + "loss": 0.6119, + "mean_token_accuracy": 0.8155626058578491, + "num_tokens": 17727120.0, + "step": 466 + }, + { + "epoch": 0.059407200101768226, + "ewc_loss": 0.00311279296875, + "ewc_loss_parallel": 3.11434268951416e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.634404182434082, + "learning_rate": 1.9754133107248833e-07, + "loss": 0.568, + "mean_token_accuracy": 0.8181667327880859, + "num_tokens": 17761676.0, + "step": 467 + }, + { + "epoch": 0.059534410380358734, + "ewc_loss": 0.003143310546875, + "ewc_loss_parallel": 3.1441450119018555e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.03278636932373, + "learning_rate": 1.979652395082662e-07, + "loss": 0.6342, + "mean_token_accuracy": 0.8065924644470215, + "num_tokens": 17802311.0, + "step": 468 + }, + { + "epoch": 0.05966162065894924, + "ewc_loss": 0.003082275390625, + "ewc_loss_parallel": 3.084540367126465e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.70438289642334, + "learning_rate": 1.9838914794404408e-07, + "loss": 0.5247, + "mean_token_accuracy": 0.8334892392158508, + "num_tokens": 17839048.0, + "step": 469 + }, + { + "epoch": 0.059788830937539755, + "ewc_loss": 0.003082275390625, + "ewc_loss_parallel": 3.084540367126465e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.863144874572754, + "learning_rate": 1.9881305637982195e-07, + "loss": 0.5977, + "mean_token_accuracy": 0.8152921795845032, + "num_tokens": 17881510.0, + "step": 470 + }, + { + "epoch": 0.05991604121613026, + "ewc_loss": 0.00311279296875, + "ewc_loss_parallel": 3.11434268951416e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.928345680236816, + "learning_rate": 1.9923696481559982e-07, + "loss": 0.6247, + "mean_token_accuracy": 0.8092654347419739, + "num_tokens": 17915080.0, + "step": 471 + }, + { + "epoch": 0.06004325149472077, + "ewc_loss": 0.0031890869140625, + "ewc_loss_parallel": 3.1888484954833984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.527360916137695, + "learning_rate": 1.996608732513777e-07, + "loss": 0.6566, + "mean_token_accuracy": 0.7995054721832275, + "num_tokens": 17957972.0, + "step": 472 + }, + { + "epoch": 0.060170461773311285, + "ewc_loss": 0.003204345703125, + "ewc_loss_parallel": 3.203749656677246e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.01264762878418, + "learning_rate": 2.0008478168715557e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.8328503966331482, + "num_tokens": 17998008.0, + "step": 473 + }, + { + "epoch": 0.06029767205190179, + "ewc_loss": 0.003082275390625, + "ewc_loss_parallel": 3.084540367126465e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.2641019821167, + "learning_rate": 2.0050869012293344e-07, + "loss": 0.5349, + "mean_token_accuracy": 0.8311365842819214, + "num_tokens": 18032427.0, + "step": 474 + }, + { + "epoch": 0.060424882330492306, + "ewc_loss": 0.0031280517578125, + "ewc_loss_parallel": 3.129243850708008e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.883825302124023, + "learning_rate": 2.009325985587113e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.8229593634605408, + "num_tokens": 18069848.0, + "step": 475 + }, + { + "epoch": 0.060552092609082814, + "ewc_loss": 0.00311279296875, + "ewc_loss_parallel": 3.11434268951416e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.832942008972168, + "learning_rate": 2.0135650699448918e-07, + "loss": 0.6581, + "mean_token_accuracy": 0.7975519895553589, + "num_tokens": 18109240.0, + "step": 476 + }, + { + "epoch": 0.06067930288767332, + "ewc_loss": 0.00311279296875, + "ewc_loss_parallel": 3.11434268951416e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.981467247009277, + "learning_rate": 2.0178041543026706e-07, + "loss": 0.629, + "mean_token_accuracy": 0.8080418109893799, + "num_tokens": 18146849.0, + "step": 477 + }, + { + "epoch": 0.060806513166263836, + "ewc_loss": 0.00311279296875, + "ewc_loss_parallel": 3.11434268951416e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.467509269714355, + "learning_rate": 2.022043238660449e-07, + "loss": 0.6597, + "mean_token_accuracy": 0.7963286638259888, + "num_tokens": 18186872.0, + "step": 478 + }, + { + "epoch": 0.06093372344485434, + "ewc_loss": 0.0031890869140625, + "ewc_loss_parallel": 3.1888484954833984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.068099975585938, + "learning_rate": 2.026282323018228e-07, + "loss": 0.6352, + "mean_token_accuracy": 0.8068424463272095, + "num_tokens": 18222299.0, + "step": 479 + }, + { + "epoch": 0.06106093372344486, + "ewc_loss": 0.0031280517578125, + "ewc_loss_parallel": 3.129243850708008e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.715117454528809, + "learning_rate": 2.0305214073760065e-07, + "loss": 0.6385, + "mean_token_accuracy": 0.8076671361923218, + "num_tokens": 18261645.0, + "step": 480 + }, + { + "epoch": 0.061188144002035365, + "ewc_loss": 0.0031280517578125, + "ewc_loss_parallel": 3.129243850708008e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.233135223388672, + "learning_rate": 2.0347604917337855e-07, + "loss": 0.6094, + "mean_token_accuracy": 0.8134235143661499, + "num_tokens": 18292970.0, + "step": 481 + }, + { + "epoch": 0.06131535428062587, + "ewc_loss": 0.0032196044921875, + "ewc_loss_parallel": 3.2186508178710938e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.088891983032227, + "learning_rate": 2.038999576091564e-07, + "loss": 0.5553, + "mean_token_accuracy": 0.8295435905456543, + "num_tokens": 18331133.0, + "step": 482 + }, + { + "epoch": 0.06144256455921639, + "ewc_loss": 0.0031890869140625, + "ewc_loss_parallel": 3.1888484954833984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.202621459960938, + "learning_rate": 2.043238660449343e-07, + "loss": 0.6426, + "mean_token_accuracy": 0.8023499250411987, + "num_tokens": 18368905.0, + "step": 483 + }, + { + "epoch": 0.061569774837806894, + "ewc_loss": 0.003204345703125, + "ewc_loss_parallel": 3.203749656677246e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.382076263427734, + "learning_rate": 2.0474777448071214e-07, + "loss": 0.5675, + "mean_token_accuracy": 0.8251606225967407, + "num_tokens": 18406362.0, + "step": 484 + }, + { + "epoch": 0.0616969851163974, + "ewc_loss": 0.003204345703125, + "ewc_loss_parallel": 3.203749656677246e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.112736701965332, + "learning_rate": 2.0517168291649004e-07, + "loss": 0.6051, + "mean_token_accuracy": 0.8139809966087341, + "num_tokens": 18439246.0, + "step": 485 + }, + { + "epoch": 0.061824195394987916, + "ewc_loss": 0.0031890869140625, + "ewc_loss_parallel": 3.1888484954833984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.499053955078125, + "learning_rate": 2.0559559135226788e-07, + "loss": 0.6076, + "mean_token_accuracy": 0.8103122711181641, + "num_tokens": 18473408.0, + "step": 486 + }, + { + "epoch": 0.06195140567357842, + "ewc_loss": 0.00323486328125, + "ewc_loss_parallel": 3.2335519790649414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.166570663452148, + "learning_rate": 2.0601949978804578e-07, + "loss": 0.5617, + "mean_token_accuracy": 0.8239738941192627, + "num_tokens": 18513246.0, + "step": 487 + }, + { + "epoch": 0.06207861595216894, + "ewc_loss": 0.003204345703125, + "ewc_loss_parallel": 3.203749656677246e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.918802261352539, + "learning_rate": 2.0644340822382363e-07, + "loss": 0.5334, + "mean_token_accuracy": 0.8324356079101562, + "num_tokens": 18549125.0, + "step": 488 + }, + { + "epoch": 0.062205826230759445, + "ewc_loss": 0.003204345703125, + "ewc_loss_parallel": 3.203749656677246e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.053918838500977, + "learning_rate": 2.0686731665960153e-07, + "loss": 0.5834, + "mean_token_accuracy": 0.8245291709899902, + "num_tokens": 18585306.0, + "step": 489 + }, + { + "epoch": 0.06233303650934995, + "ewc_loss": 0.00323486328125, + "ewc_loss_parallel": 3.2335519790649414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.00033950805664, + "learning_rate": 2.0729122509537937e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8322445154190063, + "num_tokens": 18624378.0, + "step": 490 + }, + { + "epoch": 0.06246024678794047, + "ewc_loss": 0.0032501220703125, + "ewc_loss_parallel": 3.248453140258789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.437751770019531, + "learning_rate": 2.0771513353115727e-07, + "loss": 0.5882, + "mean_token_accuracy": 0.8169045448303223, + "num_tokens": 18660814.0, + "step": 491 + }, + { + "epoch": 0.06258745706653097, + "ewc_loss": 0.0033111572265625, + "ewc_loss_parallel": 3.3080577850341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.105992317199707, + "learning_rate": 2.0813904196693512e-07, + "loss": 0.5541, + "mean_token_accuracy": 0.8285991549491882, + "num_tokens": 18702129.0, + "step": 492 + }, + { + "epoch": 0.06271466734512149, + "ewc_loss": 0.00323486328125, + "ewc_loss_parallel": 3.2335519790649414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.477640151977539, + "learning_rate": 2.0856295040271302e-07, + "loss": 0.6282, + "mean_token_accuracy": 0.8037846088409424, + "num_tokens": 18737496.0, + "step": 493 + }, + { + "epoch": 0.06284187762371199, + "ewc_loss": 0.0032806396484375, + "ewc_loss_parallel": 3.2782554626464844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.991774559020996, + "learning_rate": 2.0898685883849086e-07, + "loss": 0.5833, + "mean_token_accuracy": 0.8216476440429688, + "num_tokens": 18779332.0, + "step": 494 + }, + { + "epoch": 0.0629690879023025, + "ewc_loss": 0.00323486328125, + "ewc_loss_parallel": 3.2335519790649414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.87586498260498, + "learning_rate": 2.0941076727426874e-07, + "loss": 0.5686, + "mean_token_accuracy": 0.8233376145362854, + "num_tokens": 18817976.0, + "step": 495 + }, + { + "epoch": 0.06309629818089302, + "ewc_loss": 0.0032806396484375, + "ewc_loss_parallel": 3.2782554626464844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.78968334197998, + "learning_rate": 2.098346757100466e-07, + "loss": 0.5596, + "mean_token_accuracy": 0.8247767090797424, + "num_tokens": 18862860.0, + "step": 496 + }, + { + "epoch": 0.06322350845948353, + "ewc_loss": 0.0032958984375, + "ewc_loss_parallel": 3.293156623840332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.030472755432129, + "learning_rate": 2.1025858414582448e-07, + "loss": 0.6018, + "mean_token_accuracy": 0.8131774067878723, + "num_tokens": 18899025.0, + "step": 497 + }, + { + "epoch": 0.06335071873807403, + "ewc_loss": 0.0032806396484375, + "ewc_loss_parallel": 3.2782554626464844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.180715560913086, + "learning_rate": 2.1068249258160238e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.8174260854721069, + "num_tokens": 18933118.0, + "step": 498 + }, + { + "epoch": 0.06347792901666455, + "ewc_loss": 0.0033111572265625, + "ewc_loss_parallel": 3.3080577850341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.176590919494629, + "learning_rate": 2.1110640101738023e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.8252557516098022, + "num_tokens": 18969165.0, + "step": 499 + }, + { + "epoch": 0.06360513929525506, + "ewc_loss": 0.0032958984375, + "ewc_loss_parallel": 3.293156623840332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.026233673095703, + "learning_rate": 2.1153030945315813e-07, + "loss": 0.5775, + "mean_token_accuracy": 0.8201936483383179, + "num_tokens": 19003882.0, + "step": 500 + }, + { + "epoch": 0.06373234957384556, + "ewc_loss": 0.0033111572265625, + "ewc_loss_parallel": 3.3080577850341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.983187675476074, + "learning_rate": 2.1195421788893597e-07, + "loss": 0.5775, + "mean_token_accuracy": 0.8218468427658081, + "num_tokens": 19037540.0, + "step": 501 + }, + { + "epoch": 0.06385955985243608, + "ewc_loss": 0.0032958984375, + "ewc_loss_parallel": 3.293156623840332e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.284393310546875, + "learning_rate": 2.1237812632471387e-07, + "loss": 0.5874, + "mean_token_accuracy": 0.8135951161384583, + "num_tokens": 19077270.0, + "step": 502 + }, + { + "epoch": 0.06398677013102659, + "ewc_loss": 0.0033416748046875, + "ewc_loss_parallel": 3.337860107421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.094712257385254, + "learning_rate": 2.1280203476049172e-07, + "loss": 0.6757, + "mean_token_accuracy": 0.7885056138038635, + "num_tokens": 19117579.0, + "step": 503 + }, + { + "epoch": 0.06411398040961709, + "ewc_loss": 0.0033111572265625, + "ewc_loss_parallel": 3.3080577850341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.879365921020508, + "learning_rate": 2.1322594319626962e-07, + "loss": 0.6157, + "mean_token_accuracy": 0.8089576959609985, + "num_tokens": 19156599.0, + "step": 504 + }, + { + "epoch": 0.0642411906882076, + "ewc_loss": 0.0033111572265625, + "ewc_loss_parallel": 3.3080577850341797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.55672836303711, + "learning_rate": 2.1364985163204746e-07, + "loss": 0.6072, + "mean_token_accuracy": 0.8120589256286621, + "num_tokens": 19187367.0, + "step": 505 + }, + { + "epoch": 0.06436840096679812, + "ewc_loss": 0.00335693359375, + "ewc_loss_parallel": 3.3527612686157227e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.135152816772461, + "learning_rate": 2.1407376006782536e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.83067387342453, + "num_tokens": 19223916.0, + "step": 506 + }, + { + "epoch": 0.06449561124538863, + "ewc_loss": 0.003326416015625, + "ewc_loss_parallel": 3.3229589462280273e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.092111587524414, + "learning_rate": 2.144976685036032e-07, + "loss": 0.5567, + "mean_token_accuracy": 0.8270295262336731, + "num_tokens": 19260336.0, + "step": 507 + }, + { + "epoch": 0.06462282152397913, + "ewc_loss": 0.003326416015625, + "ewc_loss_parallel": 3.3229589462280273e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.376781463623047, + "learning_rate": 2.149215769393811e-07, + "loss": 0.6208, + "mean_token_accuracy": 0.8108993768692017, + "num_tokens": 19299895.0, + "step": 508 + }, + { + "epoch": 0.06475003180256965, + "ewc_loss": 0.0033416748046875, + "ewc_loss_parallel": 3.337860107421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.254956245422363, + "learning_rate": 2.1534548537515895e-07, + "loss": 0.5601, + "mean_token_accuracy": 0.8248865604400635, + "num_tokens": 19336614.0, + "step": 509 + }, + { + "epoch": 0.06487724208116016, + "ewc_loss": 0.0033416748046875, + "ewc_loss_parallel": 3.337860107421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.102801322937012, + "learning_rate": 2.1576939381093685e-07, + "loss": 0.6107, + "mean_token_accuracy": 0.8132337331771851, + "num_tokens": 19378683.0, + "step": 510 + }, + { + "epoch": 0.06500445235975066, + "ewc_loss": 0.00335693359375, + "ewc_loss_parallel": 3.3527612686157227e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.975672721862793, + "learning_rate": 2.161933022467147e-07, + "loss": 0.5948, + "mean_token_accuracy": 0.8125658631324768, + "num_tokens": 19422598.0, + "step": 511 + }, + { + "epoch": 0.06513166263834118, + "ewc_loss": 0.0033416748046875, + "ewc_loss_parallel": 3.337860107421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.905326843261719, + "learning_rate": 2.166172106824926e-07, + "loss": 0.5845, + "mean_token_accuracy": 0.8217712640762329, + "num_tokens": 19466972.0, + "step": 512 + }, + { + "epoch": 0.06525887291693169, + "ewc_loss": 0.00335693359375, + "ewc_loss_parallel": 3.3527612686157227e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.392623901367188, + "learning_rate": 2.1704111911827044e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8171294927597046, + "num_tokens": 19503809.0, + "step": 513 + }, + { + "epoch": 0.0653860831955222, + "ewc_loss": 0.0033721923828125, + "ewc_loss_parallel": 3.3676624298095703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.30720329284668, + "learning_rate": 2.1746502755404831e-07, + "loss": 0.6312, + "mean_token_accuracy": 0.8114003539085388, + "num_tokens": 19545713.0, + "step": 514 + }, + { + "epoch": 0.06551329347411271, + "ewc_loss": 0.0033416748046875, + "ewc_loss_parallel": 3.337860107421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.413412094116211, + "learning_rate": 2.178889359898262e-07, + "loss": 0.6183, + "mean_token_accuracy": 0.8121883869171143, + "num_tokens": 19582272.0, + "step": 515 + }, + { + "epoch": 0.06564050375270322, + "ewc_loss": 0.003387451171875, + "ewc_loss_parallel": 3.382563591003418e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.963459014892578, + "learning_rate": 2.1831284442560406e-07, + "loss": 0.6334, + "mean_token_accuracy": 0.8067617416381836, + "num_tokens": 19625574.0, + "step": 516 + }, + { + "epoch": 0.06576771403129372, + "ewc_loss": 0.003326416015625, + "ewc_loss_parallel": 3.3229589462280273e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.956510543823242, + "learning_rate": 2.1873675286138193e-07, + "loss": 0.6205, + "mean_token_accuracy": 0.8072735071182251, + "num_tokens": 19667791.0, + "step": 517 + }, + { + "epoch": 0.06589492430988424, + "ewc_loss": 0.0033721923828125, + "ewc_loss_parallel": 3.3676624298095703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.171529769897461, + "learning_rate": 2.191606612971598e-07, + "loss": 0.6505, + "mean_token_accuracy": 0.8009616136550903, + "num_tokens": 19708289.0, + "step": 518 + }, + { + "epoch": 0.06602213458847475, + "ewc_loss": 0.0034027099609375, + "ewc_loss_parallel": 3.3974647521972656e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.101755142211914, + "learning_rate": 2.1958456973293768e-07, + "loss": 0.6286, + "mean_token_accuracy": 0.802263617515564, + "num_tokens": 19747444.0, + "step": 519 + }, + { + "epoch": 0.06614934486706527, + "ewc_loss": 0.0033721923828125, + "ewc_loss_parallel": 3.3676624298095703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.21726131439209, + "learning_rate": 2.2000847816871555e-07, + "loss": 0.5173, + "mean_token_accuracy": 0.8346731066703796, + "num_tokens": 19787061.0, + "step": 520 + }, + { + "epoch": 0.06627655514565577, + "ewc_loss": 0.0034332275390625, + "ewc_loss_parallel": 3.427267074584961e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.262689590454102, + "learning_rate": 2.2043238660449342e-07, + "loss": 0.5585, + "mean_token_accuracy": 0.8273158073425293, + "num_tokens": 19826893.0, + "step": 521 + }, + { + "epoch": 0.06640376542424628, + "ewc_loss": 0.0034332275390625, + "ewc_loss_parallel": 3.427267074584961e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.341031074523926, + "learning_rate": 2.208562950402713e-07, + "loss": 0.5509, + "mean_token_accuracy": 0.8236265778541565, + "num_tokens": 19864285.0, + "step": 522 + }, + { + "epoch": 0.0665309757028368, + "ewc_loss": 0.0034332275390625, + "ewc_loss_parallel": 3.427267074584961e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.044190406799316, + "learning_rate": 2.2128020347604917e-07, + "loss": 0.5864, + "mean_token_accuracy": 0.8177666068077087, + "num_tokens": 19903806.0, + "step": 523 + }, + { + "epoch": 0.0666581859814273, + "ewc_loss": 0.0033721923828125, + "ewc_loss_parallel": 3.3676624298095703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.20861530303955, + "learning_rate": 2.2170411191182704e-07, + "loss": 0.5636, + "mean_token_accuracy": 0.8229202628135681, + "num_tokens": 19950382.0, + "step": 524 + }, + { + "epoch": 0.06678539626001781, + "ewc_loss": 0.003387451171875, + "ewc_loss_parallel": 3.382563591003418e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.466981887817383, + "learning_rate": 2.221280203476049e-07, + "loss": 0.5878, + "mean_token_accuracy": 0.8145462870597839, + "num_tokens": 19984765.0, + "step": 525 + }, + { + "epoch": 0.06691260653860832, + "ewc_loss": 0.003448486328125, + "ewc_loss_parallel": 3.4421682357788086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.278647422790527, + "learning_rate": 2.2255192878338279e-07, + "loss": 0.6287, + "mean_token_accuracy": 0.8085122108459473, + "num_tokens": 20021081.0, + "step": 526 + }, + { + "epoch": 0.06703981681719882, + "ewc_loss": 0.0034332275390625, + "ewc_loss_parallel": 3.427267074584961e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.680251121520996, + "learning_rate": 2.2297583721916066e-07, + "loss": 0.6028, + "mean_token_accuracy": 0.8131848573684692, + "num_tokens": 20055860.0, + "step": 527 + }, + { + "epoch": 0.06716702709578934, + "ewc_loss": 0.0034637451171875, + "ewc_loss_parallel": 3.4570693969726562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.315101623535156, + "learning_rate": 2.2339974565493853e-07, + "loss": 0.556, + "mean_token_accuracy": 0.8287105560302734, + "num_tokens": 20095023.0, + "step": 528 + }, + { + "epoch": 0.06729423737437985, + "ewc_loss": 0.00341796875, + "ewc_loss_parallel": 3.4123659133911133e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.079708099365234, + "learning_rate": 2.238236540907164e-07, + "loss": 0.6103, + "mean_token_accuracy": 0.8119677901268005, + "num_tokens": 20134620.0, + "step": 529 + }, + { + "epoch": 0.06742144765297035, + "ewc_loss": 0.003387451171875, + "ewc_loss_parallel": 3.382563591003418e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.460023880004883, + "learning_rate": 2.2424756252649428e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8170394897460938, + "num_tokens": 20169690.0, + "step": 530 + }, + { + "epoch": 0.06754865793156087, + "ewc_loss": 0.0034942626953125, + "ewc_loss_parallel": 3.4868717193603516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.009767532348633, + "learning_rate": 2.2467147096227215e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.82064288854599, + "num_tokens": 20203609.0, + "step": 531 + }, + { + "epoch": 0.06767586821015138, + "ewc_loss": 0.0035400390625, + "ewc_loss_parallel": 3.546476364135742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.4528226852417, + "learning_rate": 2.2509537939805002e-07, + "loss": 0.582, + "mean_token_accuracy": 0.8173366785049438, + "num_tokens": 20234616.0, + "step": 532 + }, + { + "epoch": 0.0678030784887419, + "ewc_loss": 0.003448486328125, + "ewc_loss_parallel": 3.4421682357788086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.012495040893555, + "learning_rate": 2.2551928783382787e-07, + "loss": 0.5696, + "mean_token_accuracy": 0.8202855587005615, + "num_tokens": 20271634.0, + "step": 533 + }, + { + "epoch": 0.0679302887673324, + "ewc_loss": 0.0034637451171875, + "ewc_loss_parallel": 3.4570693969726562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.394227981567383, + "learning_rate": 2.2594319626960577e-07, + "loss": 0.5982, + "mean_token_accuracy": 0.8182967901229858, + "num_tokens": 20309683.0, + "step": 534 + }, + { + "epoch": 0.06805749904592291, + "ewc_loss": 0.0035247802734375, + "ewc_loss_parallel": 3.5315752029418945e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.231873512268066, + "learning_rate": 2.263671047053836e-07, + "loss": 0.5121, + "mean_token_accuracy": 0.8346455097198486, + "num_tokens": 20346013.0, + "step": 535 + }, + { + "epoch": 0.06818470932451343, + "ewc_loss": 0.0034942626953125, + "ewc_loss_parallel": 3.5017728805541992e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.295308113098145, + "learning_rate": 2.267910131411615e-07, + "loss": 0.5707, + "mean_token_accuracy": 0.8249009847640991, + "num_tokens": 20387613.0, + "step": 536 + }, + { + "epoch": 0.06831191960310393, + "ewc_loss": 0.003509521484375, + "ewc_loss_parallel": 3.516674041748047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.267663955688477, + "learning_rate": 2.2721492157693936e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.8224421739578247, + "num_tokens": 20417579.0, + "step": 537 + }, + { + "epoch": 0.06843912988169444, + "ewc_loss": 0.0035400390625, + "ewc_loss_parallel": 3.546476364135742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.293700218200684, + "learning_rate": 2.2763883001271726e-07, + "loss": 0.5421, + "mean_token_accuracy": 0.8288940787315369, + "num_tokens": 20455100.0, + "step": 538 + }, + { + "epoch": 0.06856634016028496, + "ewc_loss": 0.0035400390625, + "ewc_loss_parallel": 3.546476364135742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.6752347946167, + "learning_rate": 2.280627384484951e-07, + "loss": 0.6043, + "mean_token_accuracy": 0.8091926574707031, + "num_tokens": 20496458.0, + "step": 539 + }, + { + "epoch": 0.06869355043887546, + "ewc_loss": 0.0036163330078125, + "ewc_loss_parallel": 3.6209821701049805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.360370635986328, + "learning_rate": 2.28486646884273e-07, + "loss": 0.5773, + "mean_token_accuracy": 0.8195345401763916, + "num_tokens": 20533758.0, + "step": 540 + }, + { + "epoch": 0.06882076071746597, + "ewc_loss": 0.0035858154296875, + "ewc_loss_parallel": 3.591179847717285e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.26724910736084, + "learning_rate": 2.2891055532005085e-07, + "loss": 0.6225, + "mean_token_accuracy": 0.810275673866272, + "num_tokens": 20567259.0, + "step": 541 + }, + { + "epoch": 0.06894797099605648, + "ewc_loss": 0.0035858154296875, + "ewc_loss_parallel": 3.591179847717285e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.525126457214355, + "learning_rate": 2.2933446375582875e-07, + "loss": 0.6042, + "mean_token_accuracy": 0.8154175281524658, + "num_tokens": 20599333.0, + "step": 542 + }, + { + "epoch": 0.06907518127464699, + "ewc_loss": 0.0036468505859375, + "ewc_loss_parallel": 3.6507844924926758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.452675819396973, + "learning_rate": 2.297583721916066e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8325679898262024, + "num_tokens": 20640195.0, + "step": 543 + }, + { + "epoch": 0.0692023915532375, + "ewc_loss": 0.0036163330078125, + "ewc_loss_parallel": 3.6209821701049805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.41833209991455, + "learning_rate": 2.301822806273845e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.8241598606109619, + "num_tokens": 20682617.0, + "step": 544 + }, + { + "epoch": 0.06932960183182801, + "ewc_loss": 0.0036163330078125, + "ewc_loss_parallel": 3.6209821701049805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.645398139953613, + "learning_rate": 2.3060618906316234e-07, + "loss": 0.6141, + "mean_token_accuracy": 0.8116605281829834, + "num_tokens": 20718032.0, + "step": 545 + }, + { + "epoch": 0.06945681211041853, + "ewc_loss": 0.003692626953125, + "ewc_loss_parallel": 3.6954879760742188e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.410626411437988, + "learning_rate": 2.3103009749894024e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8208988308906555, + "num_tokens": 20755190.0, + "step": 546 + }, + { + "epoch": 0.06958402238900903, + "ewc_loss": 0.003631591796875, + "ewc_loss_parallel": 3.635883331298828e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 8.924070358276367, + "learning_rate": 2.3145400593471808e-07, + "loss": 0.6031, + "mean_token_accuracy": 0.8125118017196655, + "num_tokens": 20799088.0, + "step": 547 + }, + { + "epoch": 0.06971123266759954, + "ewc_loss": 0.003570556640625, + "ewc_loss_parallel": 3.5762786865234375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.404620170593262, + "learning_rate": 2.3187791437049598e-07, + "loss": 0.6347, + "mean_token_accuracy": 0.8025035858154297, + "num_tokens": 20831819.0, + "step": 548 + }, + { + "epoch": 0.06983844294619006, + "ewc_loss": 0.0037078857421875, + "ewc_loss_parallel": 3.7103891372680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.542469024658203, + "learning_rate": 2.3230182280627383e-07, + "loss": 0.5648, + "mean_token_accuracy": 0.8257513046264648, + "num_tokens": 20861291.0, + "step": 549 + }, + { + "epoch": 0.06996565322478056, + "ewc_loss": 0.0036773681640625, + "ewc_loss_parallel": 3.680586814880371e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.325553894042969, + "learning_rate": 2.327257312420517e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.8343253135681152, + "num_tokens": 20897446.0, + "step": 550 + }, + { + "epoch": 0.07009286350337107, + "ewc_loss": 0.0036773681640625, + "ewc_loss_parallel": 3.680586814880371e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.470247268676758, + "learning_rate": 2.3314963967782957e-07, + "loss": 0.563, + "mean_token_accuracy": 0.82077956199646, + "num_tokens": 20934423.0, + "step": 551 + }, + { + "epoch": 0.07022007378196159, + "ewc_loss": 0.0037078857421875, + "ewc_loss_parallel": 3.7103891372680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.420345306396484, + "learning_rate": 2.3357354811360745e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8222862482070923, + "num_tokens": 20978394.0, + "step": 552 + }, + { + "epoch": 0.07034728406055209, + "ewc_loss": 0.0037078857421875, + "ewc_loss_parallel": 3.7103891372680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.726045608520508, + "learning_rate": 2.3399745654938532e-07, + "loss": 0.6078, + "mean_token_accuracy": 0.8128058910369873, + "num_tokens": 21010669.0, + "step": 553 + }, + { + "epoch": 0.0704744943391426, + "ewc_loss": 0.003753662109375, + "ewc_loss_parallel": 3.7550926208496094e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.36900806427002, + "learning_rate": 2.344213649851632e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8226631879806519, + "num_tokens": 21047696.0, + "step": 554 + }, + { + "epoch": 0.07060170461773312, + "ewc_loss": 0.003692626953125, + "ewc_loss_parallel": 3.6954879760742188e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.187056541442871, + "learning_rate": 2.3484527342094106e-07, + "loss": 0.6576, + "mean_token_accuracy": 0.7992669343948364, + "num_tokens": 21091437.0, + "step": 555 + }, + { + "epoch": 0.07072891489632362, + "ewc_loss": 0.0037078857421875, + "ewc_loss_parallel": 3.7103891372680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.287169456481934, + "learning_rate": 2.3526918185671894e-07, + "loss": 0.6177, + "mean_token_accuracy": 0.8092235326766968, + "num_tokens": 21128880.0, + "step": 556 + }, + { + "epoch": 0.07085612517491413, + "ewc_loss": 0.0037689208984375, + "ewc_loss_parallel": 3.769993782043457e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.250482559204102, + "learning_rate": 2.356930902924968e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.8234827518463135, + "num_tokens": 21168523.0, + "step": 557 + }, + { + "epoch": 0.07098333545350465, + "ewc_loss": 0.003753662109375, + "ewc_loss_parallel": 3.7550926208496094e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.289575576782227, + "learning_rate": 2.3611699872827468e-07, + "loss": 0.6578, + "mean_token_accuracy": 0.7969753742218018, + "num_tokens": 21211376.0, + "step": 558 + }, + { + "epoch": 0.07111054573209516, + "ewc_loss": 0.0037384033203125, + "ewc_loss_parallel": 3.7401914596557617e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.255143165588379, + "learning_rate": 2.3654090716405255e-07, + "loss": 0.5878, + "mean_token_accuracy": 0.8177542686462402, + "num_tokens": 21249162.0, + "step": 559 + }, + { + "epoch": 0.07123775601068566, + "ewc_loss": 0.0037689208984375, + "ewc_loss_parallel": 3.769993782043457e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.30785846710205, + "learning_rate": 2.3696481559983043e-07, + "loss": 0.6166, + "mean_token_accuracy": 0.807620644569397, + "num_tokens": 21282351.0, + "step": 560 + }, + { + "epoch": 0.07136496628927617, + "ewc_loss": 0.0037841796875, + "ewc_loss_parallel": 3.7848949432373047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.46015453338623, + "learning_rate": 2.373887240356083e-07, + "loss": 0.606, + "mean_token_accuracy": 0.8126344084739685, + "num_tokens": 21319508.0, + "step": 561 + }, + { + "epoch": 0.07149217656786669, + "ewc_loss": 0.0037994384765625, + "ewc_loss_parallel": 3.7997961044311523e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.451614379882812, + "learning_rate": 2.3781263247138617e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.823727011680603, + "num_tokens": 21357111.0, + "step": 562 + }, + { + "epoch": 0.07161938684645719, + "ewc_loss": 0.0037841796875, + "ewc_loss_parallel": 3.7848949432373047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.386919975280762, + "learning_rate": 2.3823654090716404e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.8198994994163513, + "num_tokens": 21393222.0, + "step": 563 + }, + { + "epoch": 0.0717465971250477, + "ewc_loss": 0.00384521484375, + "ewc_loss_parallel": 3.844499588012695e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.439534187316895, + "learning_rate": 2.386604493429419e-07, + "loss": 0.5274, + "mean_token_accuracy": 0.8330895304679871, + "num_tokens": 21429595.0, + "step": 564 + }, + { + "epoch": 0.07187380740363822, + "ewc_loss": 0.0037841796875, + "ewc_loss_parallel": 3.7848949432373047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.385221481323242, + "learning_rate": 2.390843577787198e-07, + "loss": 0.6235, + "mean_token_accuracy": 0.8035075664520264, + "num_tokens": 21467341.0, + "step": 565 + }, + { + "epoch": 0.07200101768222872, + "ewc_loss": 0.0037689208984375, + "ewc_loss_parallel": 3.769993782043457e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.203534126281738, + "learning_rate": 2.3950826621449766e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.845054030418396, + "num_tokens": 21510927.0, + "step": 566 + }, + { + "epoch": 0.07212822796081923, + "ewc_loss": 0.0037994384765625, + "ewc_loss_parallel": 3.7997961044311523e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.436868667602539, + "learning_rate": 2.3993217465027556e-07, + "loss": 0.5758, + "mean_token_accuracy": 0.8220071792602539, + "num_tokens": 21551604.0, + "step": 567 + }, + { + "epoch": 0.07225543823940975, + "ewc_loss": 0.00384521484375, + "ewc_loss_parallel": 3.844499588012695e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.418818473815918, + "learning_rate": 2.403560830860534e-07, + "loss": 0.6269, + "mean_token_accuracy": 0.810215950012207, + "num_tokens": 21591871.0, + "step": 568 + }, + { + "epoch": 0.07238264851800025, + "ewc_loss": 0.00384521484375, + "ewc_loss_parallel": 3.844499588012695e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.506743431091309, + "learning_rate": 2.4077999152183125e-07, + "loss": 0.6238, + "mean_token_accuracy": 0.8061359524726868, + "num_tokens": 21629810.0, + "step": 569 + }, + { + "epoch": 0.07250985879659076, + "ewc_loss": 0.00384521484375, + "ewc_loss_parallel": 3.844499588012695e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.533408164978027, + "learning_rate": 2.4120389995760915e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8402972221374512, + "num_tokens": 21668706.0, + "step": 570 + }, + { + "epoch": 0.07263706907518128, + "ewc_loss": 0.00384521484375, + "ewc_loss_parallel": 3.844499588012695e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.529180526733398, + "learning_rate": 2.41627808393387e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8216843605041504, + "num_tokens": 21712725.0, + "step": 571 + }, + { + "epoch": 0.07276427935377179, + "ewc_loss": 0.003875732421875, + "ewc_loss_parallel": 3.874301910400391e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.27562141418457, + "learning_rate": 2.420517168291649e-07, + "loss": 0.5765, + "mean_token_accuracy": 0.8230438232421875, + "num_tokens": 21746086.0, + "step": 572 + }, + { + "epoch": 0.07289148963236229, + "ewc_loss": 0.003875732421875, + "ewc_loss_parallel": 3.874301910400391e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.520772933959961, + "learning_rate": 2.4247562526494274e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.8302721977233887, + "num_tokens": 21779573.0, + "step": 573 + }, + { + "epoch": 0.0730186999109528, + "ewc_loss": 0.003875732421875, + "ewc_loss_parallel": 3.874301910400391e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.60497760772705, + "learning_rate": 2.4289953370072064e-07, + "loss": 0.6092, + "mean_token_accuracy": 0.8110078573226929, + "num_tokens": 21818766.0, + "step": 574 + }, + { + "epoch": 0.07314591018954332, + "ewc_loss": 0.003875732421875, + "ewc_loss_parallel": 3.874301910400391e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.413437843322754, + "learning_rate": 2.433234421364985e-07, + "loss": 0.5655, + "mean_token_accuracy": 0.8284212350845337, + "num_tokens": 21855539.0, + "step": 575 + }, + { + "epoch": 0.07327312046813382, + "ewc_loss": 0.003875732421875, + "ewc_loss_parallel": 3.874301910400391e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.566496849060059, + "learning_rate": 2.437473505722764e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.8335630297660828, + "num_tokens": 21892567.0, + "step": 576 + }, + { + "epoch": 0.07340033074672433, + "ewc_loss": 0.00390625, + "ewc_loss_parallel": 3.904104232788086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.460923194885254, + "learning_rate": 2.4417125900805423e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.8225746154785156, + "num_tokens": 21928607.0, + "step": 577 + }, + { + "epoch": 0.07352754102531485, + "ewc_loss": 0.00390625, + "ewc_loss_parallel": 3.904104232788086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.284791946411133, + "learning_rate": 2.4459516744383213e-07, + "loss": 0.5961, + "mean_token_accuracy": 0.8161541223526001, + "num_tokens": 21963781.0, + "step": 578 + }, + { + "epoch": 0.07365475130390535, + "ewc_loss": 0.00390625, + "ewc_loss_parallel": 3.904104232788086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.384206771850586, + "learning_rate": 2.4501907587961e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.8272507190704346, + "num_tokens": 21999140.0, + "step": 579 + }, + { + "epoch": 0.07378196158249586, + "ewc_loss": 0.00390625, + "ewc_loss_parallel": 3.904104232788086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.41475772857666, + "learning_rate": 2.454429843153879e-07, + "loss": 0.6566, + "mean_token_accuracy": 0.7966158390045166, + "num_tokens": 22038703.0, + "step": 580 + }, + { + "epoch": 0.07390917186108638, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.506511688232422, + "learning_rate": 2.458668927511657e-07, + "loss": 0.5696, + "mean_token_accuracy": 0.8152016401290894, + "num_tokens": 22071052.0, + "step": 581 + }, + { + "epoch": 0.07403638213967688, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.455100059509277, + "learning_rate": 2.462908011869436e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8327476978302002, + "num_tokens": 22109629.0, + "step": 582 + }, + { + "epoch": 0.0741635924182674, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.479325294494629, + "learning_rate": 2.4671470962272147e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8301402926445007, + "num_tokens": 22151544.0, + "step": 583 + }, + { + "epoch": 0.07429080269685791, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.51037883758545, + "learning_rate": 2.4713861805849937e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.8327685594558716, + "num_tokens": 22189241.0, + "step": 584 + }, + { + "epoch": 0.07441801297544842, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.592334747314453, + "learning_rate": 2.475625264942772e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8298783302307129, + "num_tokens": 22229138.0, + "step": 585 + }, + { + "epoch": 0.07454522325403892, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.38941478729248, + "learning_rate": 2.479864349300551e-07, + "loss": 0.5796, + "mean_token_accuracy": 0.8206969499588013, + "num_tokens": 22264930.0, + "step": 586 + }, + { + "epoch": 0.07467243353262944, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.33560848236084, + "learning_rate": 2.4841034336583296e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8311470746994019, + "num_tokens": 22297162.0, + "step": 587 + }, + { + "epoch": 0.07479964381121995, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.47437572479248, + "learning_rate": 2.488342518016108e-07, + "loss": 0.6056, + "mean_token_accuracy": 0.8099402189254761, + "num_tokens": 22334779.0, + "step": 588 + }, + { + "epoch": 0.07492685408981045, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.443513870239258, + "learning_rate": 2.492581602373887e-07, + "loss": 0.5608, + "mean_token_accuracy": 0.8262753486633301, + "num_tokens": 22372820.0, + "step": 589 + }, + { + "epoch": 0.07505406436840097, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.453935623168945, + "learning_rate": 2.4968206867316655e-07, + "loss": 0.5821, + "mean_token_accuracy": 0.8185052871704102, + "num_tokens": 22417841.0, + "step": 590 + }, + { + "epoch": 0.07518127464699148, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.504087448120117, + "learning_rate": 2.5010597710894445e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8443707823753357, + "num_tokens": 22449133.0, + "step": 591 + }, + { + "epoch": 0.07530848492558198, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.569467544555664, + "learning_rate": 2.505298855447223e-07, + "loss": 0.561, + "mean_token_accuracy": 0.8248212337493896, + "num_tokens": 22491145.0, + "step": 592 + }, + { + "epoch": 0.0754356952041725, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.48649787902832, + "learning_rate": 2.509537939805002e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8237916827201843, + "num_tokens": 22531738.0, + "step": 593 + }, + { + "epoch": 0.07556290548276301, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.020889282226562, + "learning_rate": 2.513777024162781e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.8290230631828308, + "num_tokens": 22563005.0, + "step": 594 + }, + { + "epoch": 0.07569011576135352, + "ewc_loss": 0.0040283203125, + "ewc_loss_parallel": 4.023313522338867e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.487837791442871, + "learning_rate": 2.5180161085205594e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8338282108306885, + "num_tokens": 22601333.0, + "step": 595 + }, + { + "epoch": 0.07581732603994402, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.267045974731445, + "learning_rate": 2.522255192878338e-07, + "loss": 0.5896, + "mean_token_accuracy": 0.8174474835395813, + "num_tokens": 22639528.0, + "step": 596 + }, + { + "epoch": 0.07594453631853454, + "ewc_loss": 0.00396728515625, + "ewc_loss_parallel": 3.9637088775634766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.65932846069336, + "learning_rate": 2.526494277236117e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8215019702911377, + "num_tokens": 22675290.0, + "step": 597 + }, + { + "epoch": 0.07607174659712505, + "ewc_loss": 0.0040283203125, + "ewc_loss_parallel": 4.023313522338867e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.836591720581055, + "learning_rate": 2.530733361593896e-07, + "loss": 0.5463, + "mean_token_accuracy": 0.8267026543617249, + "num_tokens": 22712823.0, + "step": 598 + }, + { + "epoch": 0.07619895687571555, + "ewc_loss": 0.003997802734375, + "ewc_loss_parallel": 3.993511199951172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.447406768798828, + "learning_rate": 2.5349724459516743e-07, + "loss": 0.5668, + "mean_token_accuracy": 0.8240025043487549, + "num_tokens": 22750916.0, + "step": 599 + }, + { + "epoch": 0.07632616715430607, + "ewc_loss": 0.003936767578125, + "ewc_loss_parallel": 3.933906555175781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.668180465698242, + "learning_rate": 2.539211530309453e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8349887132644653, + "num_tokens": 22782537.0, + "step": 600 + }, + { + "epoch": 0.07645337743289658, + "ewc_loss": 0.003997802734375, + "ewc_loss_parallel": 3.993511199951172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.691917419433594, + "learning_rate": 2.543450614667232e-07, + "loss": 0.6116, + "mean_token_accuracy": 0.811138391494751, + "num_tokens": 22814805.0, + "step": 601 + }, + { + "epoch": 0.07658058771148708, + "ewc_loss": 0.003997802734375, + "ewc_loss_parallel": 3.993511199951172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.52358627319336, + "learning_rate": 2.547689699025011e-07, + "loss": 0.6097, + "mean_token_accuracy": 0.8108119964599609, + "num_tokens": 22852654.0, + "step": 602 + }, + { + "epoch": 0.0767077979900776, + "ewc_loss": 0.003997802734375, + "ewc_loss_parallel": 3.993511199951172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.558106422424316, + "learning_rate": 2.551928783382789e-07, + "loss": 0.5653, + "mean_token_accuracy": 0.822660505771637, + "num_tokens": 22894205.0, + "step": 603 + }, + { + "epoch": 0.07683500826866811, + "ewc_loss": 0.003997802734375, + "ewc_loss_parallel": 3.993511199951172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.52061653137207, + "learning_rate": 2.5561678677405677e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.8295522928237915, + "num_tokens": 22924095.0, + "step": 604 + }, + { + "epoch": 0.07696221854725861, + "ewc_loss": 0.003997802734375, + "ewc_loss_parallel": 3.993511199951172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.577354431152344, + "learning_rate": 2.5604069520983467e-07, + "loss": 0.6134, + "mean_token_accuracy": 0.8069678544998169, + "num_tokens": 22960857.0, + "step": 605 + }, + { + "epoch": 0.07708942882584913, + "ewc_loss": 0.004058837890625, + "ewc_loss_parallel": 4.0531158447265625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.444734573364258, + "learning_rate": 2.564646036456125e-07, + "loss": 0.5638, + "mean_token_accuracy": 0.8226248025894165, + "num_tokens": 23004670.0, + "step": 606 + }, + { + "epoch": 0.07721663910443964, + "ewc_loss": 0.004058837890625, + "ewc_loss_parallel": 4.0531158447265625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.604738235473633, + "learning_rate": 2.568885120813904e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.820043683052063, + "num_tokens": 23041999.0, + "step": 607 + }, + { + "epoch": 0.07734384938303016, + "ewc_loss": 0.0040283203125, + "ewc_loss_parallel": 4.023313522338867e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.557214736938477, + "learning_rate": 2.5731242051716826e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.8286718130111694, + "num_tokens": 23080704.0, + "step": 608 + }, + { + "epoch": 0.07747105966162066, + "ewc_loss": 0.0040283203125, + "ewc_loss_parallel": 4.023313522338867e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.67643928527832, + "learning_rate": 2.5773632895294616e-07, + "loss": 0.5885, + "mean_token_accuracy": 0.8171877861022949, + "num_tokens": 23118662.0, + "step": 609 + }, + { + "epoch": 0.07759826994021117, + "ewc_loss": 0.004058837890625, + "ewc_loss_parallel": 4.0531158447265625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.620406150817871, + "learning_rate": 2.58160237388724e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8348429203033447, + "num_tokens": 23150367.0, + "step": 610 + }, + { + "epoch": 0.07772548021880168, + "ewc_loss": 0.00408935546875, + "ewc_loss_parallel": 4.082918167114258e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.632161140441895, + "learning_rate": 2.585841458245019e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8210405111312866, + "num_tokens": 23185836.0, + "step": 611 + }, + { + "epoch": 0.07785269049739219, + "ewc_loss": 0.00408935546875, + "ewc_loss_parallel": 4.082918167114258e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.775821685791016, + "learning_rate": 2.5900805426027975e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.83933424949646, + "num_tokens": 23219346.0, + "step": 612 + }, + { + "epoch": 0.0779799007759827, + "ewc_loss": 0.004119873046875, + "ewc_loss_parallel": 4.112720489501953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.606169700622559, + "learning_rate": 2.5943196269605765e-07, + "loss": 0.5821, + "mean_token_accuracy": 0.8238528966903687, + "num_tokens": 23257569.0, + "step": 613 + }, + { + "epoch": 0.07810711105457321, + "ewc_loss": 0.00408935546875, + "ewc_loss_parallel": 4.082918167114258e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.461681365966797, + "learning_rate": 2.598558711318355e-07, + "loss": 0.5712, + "mean_token_accuracy": 0.8242905139923096, + "num_tokens": 23302944.0, + "step": 614 + }, + { + "epoch": 0.07823432133316371, + "ewc_loss": 0.00408935546875, + "ewc_loss_parallel": 4.082918167114258e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.694467544555664, + "learning_rate": 2.602797795676134e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8210256695747375, + "num_tokens": 23341660.0, + "step": 615 + }, + { + "epoch": 0.07836153161175423, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.790592193603516, + "learning_rate": 2.6070368800339124e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8138516545295715, + "num_tokens": 23375956.0, + "step": 616 + }, + { + "epoch": 0.07848874189034474, + "ewc_loss": 0.004150390625, + "ewc_loss_parallel": 4.1425228118896484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.63235092163086, + "learning_rate": 2.6112759643916914e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.8321681618690491, + "num_tokens": 23407568.0, + "step": 617 + }, + { + "epoch": 0.07861595216893524, + "ewc_loss": 0.004150390625, + "ewc_loss_parallel": 4.1425228118896484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.664905548095703, + "learning_rate": 2.61551504874947e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8245136737823486, + "num_tokens": 23440261.0, + "step": 618 + }, + { + "epoch": 0.07874316244752576, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.5977201461792, + "learning_rate": 2.619754133107249e-07, + "loss": 0.5423, + "mean_token_accuracy": 0.8307082653045654, + "num_tokens": 23481706.0, + "step": 619 + }, + { + "epoch": 0.07887037272611627, + "ewc_loss": 0.004150390625, + "ewc_loss_parallel": 4.1425228118896484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.727148056030273, + "learning_rate": 2.623993217465028e-07, + "loss": 0.603, + "mean_token_accuracy": 0.808864951133728, + "num_tokens": 23515410.0, + "step": 620 + }, + { + "epoch": 0.07899758300470679, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.62744140625, + "learning_rate": 2.6282323018228063e-07, + "loss": 0.5883, + "mean_token_accuracy": 0.8207724690437317, + "num_tokens": 23554627.0, + "step": 621 + }, + { + "epoch": 0.07912479328329729, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.627124786376953, + "learning_rate": 2.632471386180585e-07, + "loss": 0.6106, + "mean_token_accuracy": 0.8092097043991089, + "num_tokens": 23593400.0, + "step": 622 + }, + { + "epoch": 0.0792520035618878, + "ewc_loss": 0.00421142578125, + "ewc_loss_parallel": 4.202127456665039e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.71731948852539, + "learning_rate": 2.6367104705383637e-07, + "loss": 0.6083, + "mean_token_accuracy": 0.8092427253723145, + "num_tokens": 23628242.0, + "step": 623 + }, + { + "epoch": 0.07937921384047832, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.756543159484863, + "learning_rate": 2.6409495548961427e-07, + "loss": 0.5938, + "mean_token_accuracy": 0.8197449445724487, + "num_tokens": 23671041.0, + "step": 624 + }, + { + "epoch": 0.07950642411906882, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.703675270080566, + "learning_rate": 2.6451886392539206e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.824008584022522, + "num_tokens": 23701904.0, + "step": 625 + }, + { + "epoch": 0.07963363439765933, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.693129539489746, + "learning_rate": 2.6494277236116996e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8298010230064392, + "num_tokens": 23742374.0, + "step": 626 + }, + { + "epoch": 0.07976084467624985, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.813557624816895, + "learning_rate": 2.6536668079694786e-07, + "loss": 0.6419, + "mean_token_accuracy": 0.8013956546783447, + "num_tokens": 23780216.0, + "step": 627 + }, + { + "epoch": 0.07988805495484035, + "ewc_loss": 0.00421142578125, + "ewc_loss_parallel": 4.202127456665039e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.781184196472168, + "learning_rate": 2.6579058923272576e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8461324572563171, + "num_tokens": 23813641.0, + "step": 628 + }, + { + "epoch": 0.08001526523343086, + "ewc_loss": 0.004241943359375, + "ewc_loss_parallel": 4.231929779052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.717202186584473, + "learning_rate": 2.6621449766850356e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8202693462371826, + "num_tokens": 23850189.0, + "step": 629 + }, + { + "epoch": 0.08014247551202137, + "ewc_loss": 0.00421142578125, + "ewc_loss_parallel": 4.202127456665039e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.546305656433105, + "learning_rate": 2.6663840610428145e-07, + "loss": 0.5338, + "mean_token_accuracy": 0.8319794535636902, + "num_tokens": 23884410.0, + "step": 630 + }, + { + "epoch": 0.08026968579061187, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.673376083374023, + "learning_rate": 2.6706231454005935e-07, + "loss": 0.5451, + "mean_token_accuracy": 0.8289915323257446, + "num_tokens": 23930139.0, + "step": 631 + }, + { + "epoch": 0.08039689606920239, + "ewc_loss": 0.004241943359375, + "ewc_loss_parallel": 4.231929779052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.563189506530762, + "learning_rate": 2.6748622297583725e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.8309341669082642, + "num_tokens": 23973124.0, + "step": 632 + }, + { + "epoch": 0.0805241063477929, + "ewc_loss": 0.004180908203125, + "ewc_loss_parallel": 4.172325134277344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.631627082824707, + "learning_rate": 2.6791013141161505e-07, + "loss": 0.5509, + "mean_token_accuracy": 0.829120397567749, + "num_tokens": 24008676.0, + "step": 633 + }, + { + "epoch": 0.08065131662638342, + "ewc_loss": 0.004241943359375, + "ewc_loss_parallel": 4.231929779052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.789318084716797, + "learning_rate": 2.6833403984739294e-07, + "loss": 0.5483, + "mean_token_accuracy": 0.8269062638282776, + "num_tokens": 24044880.0, + "step": 634 + }, + { + "epoch": 0.08077852690497392, + "ewc_loss": 0.004241943359375, + "ewc_loss_parallel": 4.231929779052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.65324878692627, + "learning_rate": 2.6875794828317084e-07, + "loss": 0.5392, + "mean_token_accuracy": 0.8345847129821777, + "num_tokens": 24090140.0, + "step": 635 + }, + { + "epoch": 0.08090573718356443, + "ewc_loss": 0.004241943359375, + "ewc_loss_parallel": 4.231929779052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.95059585571289, + "learning_rate": 2.6918185671894874e-07, + "loss": 0.6254, + "mean_token_accuracy": 0.8095943927764893, + "num_tokens": 24123311.0, + "step": 636 + }, + { + "epoch": 0.08103294746215495, + "ewc_loss": 0.004302978515625, + "ewc_loss_parallel": 4.291534423828125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.862176895141602, + "learning_rate": 2.6960576515472654e-07, + "loss": 0.6584, + "mean_token_accuracy": 0.7960623502731323, + "num_tokens": 24164019.0, + "step": 637 + }, + { + "epoch": 0.08116015774074545, + "ewc_loss": 0.004241943359375, + "ewc_loss_parallel": 4.231929779052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.800095558166504, + "learning_rate": 2.7002967359050443e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.8352513313293457, + "num_tokens": 24205986.0, + "step": 638 + }, + { + "epoch": 0.08128736801933596, + "ewc_loss": 0.004241943359375, + "ewc_loss_parallel": 4.231929779052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.613250732421875, + "learning_rate": 2.7045358202628233e-07, + "loss": 0.583, + "mean_token_accuracy": 0.8153886198997498, + "num_tokens": 24251956.0, + "step": 639 + }, + { + "epoch": 0.08141457829792648, + "ewc_loss": 0.0042724609375, + "ewc_loss_parallel": 4.26173210144043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.787614822387695, + "learning_rate": 2.7087749046206023e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8313931226730347, + "num_tokens": 24284990.0, + "step": 640 + }, + { + "epoch": 0.08154178857651698, + "ewc_loss": 0.0042724609375, + "ewc_loss_parallel": 4.26173210144043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.791581153869629, + "learning_rate": 2.71301398897838e-07, + "loss": 0.5838, + "mean_token_accuracy": 0.8173868656158447, + "num_tokens": 24323305.0, + "step": 641 + }, + { + "epoch": 0.08166899885510749, + "ewc_loss": 0.004302978515625, + "ewc_loss_parallel": 4.291534423828125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.571796417236328, + "learning_rate": 2.717253073336159e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8427569270133972, + "num_tokens": 24365847.0, + "step": 642 + }, + { + "epoch": 0.081796209133698, + "ewc_loss": 0.0042724609375, + "ewc_loss_parallel": 4.26173210144043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.783638000488281, + "learning_rate": 2.721492157693938e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8407790064811707, + "num_tokens": 24407269.0, + "step": 643 + }, + { + "epoch": 0.0819234194122885, + "ewc_loss": 0.004302978515625, + "ewc_loss_parallel": 4.291534423828125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.007226943969727, + "learning_rate": 2.7257312420517167e-07, + "loss": 0.5602, + "mean_token_accuracy": 0.8231455087661743, + "num_tokens": 24444990.0, + "step": 644 + }, + { + "epoch": 0.08205062969087902, + "ewc_loss": 0.004302978515625, + "ewc_loss_parallel": 4.291534423828125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.642892837524414, + "learning_rate": 2.729970326409495e-07, + "loss": 0.5909, + "mean_token_accuracy": 0.8183534145355225, + "num_tokens": 24487293.0, + "step": 645 + }, + { + "epoch": 0.08217783996946953, + "ewc_loss": 0.0042724609375, + "ewc_loss_parallel": 4.26173210144043e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.722743034362793, + "learning_rate": 2.734209410767274e-07, + "loss": 0.5521, + "mean_token_accuracy": 0.8273075819015503, + "num_tokens": 24521403.0, + "step": 646 + }, + { + "epoch": 0.08230505024806005, + "ewc_loss": 0.00433349609375, + "ewc_loss_parallel": 4.32133674621582e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.815242767333984, + "learning_rate": 2.738448495125053e-07, + "loss": 0.5598, + "mean_token_accuracy": 0.8284623622894287, + "num_tokens": 24562740.0, + "step": 647 + }, + { + "epoch": 0.08243226052665055, + "ewc_loss": 0.00433349609375, + "ewc_loss_parallel": 4.32133674621582e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.633074760437012, + "learning_rate": 2.7426875794828316e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.8174560070037842, + "num_tokens": 24597507.0, + "step": 648 + }, + { + "epoch": 0.08255947080524106, + "ewc_loss": 0.004302978515625, + "ewc_loss_parallel": 4.291534423828125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.710471153259277, + "learning_rate": 2.74692666384061e-07, + "loss": 0.5295, + "mean_token_accuracy": 0.8320739269256592, + "num_tokens": 24635629.0, + "step": 649 + }, + { + "epoch": 0.08268668108383158, + "ewc_loss": 0.00433349609375, + "ewc_loss_parallel": 4.32133674621582e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.883650779724121, + "learning_rate": 2.751165748198389e-07, + "loss": 0.555, + "mean_token_accuracy": 0.8242303729057312, + "num_tokens": 24676087.0, + "step": 650 + }, + { + "epoch": 0.08281389136242208, + "ewc_loss": 0.004364013671875, + "ewc_loss_parallel": 4.351139068603516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.908267974853516, + "learning_rate": 2.755404832556168e-07, + "loss": 0.5744, + "mean_token_accuracy": 0.821013867855072, + "num_tokens": 24713652.0, + "step": 651 + }, + { + "epoch": 0.0829411016410126, + "ewc_loss": 0.00433349609375, + "ewc_loss_parallel": 4.32133674621582e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.7058687210083, + "learning_rate": 2.7596439169139465e-07, + "loss": 0.5564, + "mean_token_accuracy": 0.8278703093528748, + "num_tokens": 24750130.0, + "step": 652 + }, + { + "epoch": 0.08306831191960311, + "ewc_loss": 0.00433349609375, + "ewc_loss_parallel": 4.32133674621582e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.857597351074219, + "learning_rate": 2.763883001271725e-07, + "loss": 0.607, + "mean_token_accuracy": 0.8100649118423462, + "num_tokens": 24787337.0, + "step": 653 + }, + { + "epoch": 0.08319552219819361, + "ewc_loss": 0.00433349609375, + "ewc_loss_parallel": 4.32133674621582e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.85219669342041, + "learning_rate": 2.768122085629504e-07, + "loss": 0.5982, + "mean_token_accuracy": 0.8066340684890747, + "num_tokens": 24822736.0, + "step": 654 + }, + { + "epoch": 0.08332273247678412, + "ewc_loss": 0.004364013671875, + "ewc_loss_parallel": 4.351139068603516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.86165714263916, + "learning_rate": 2.772361169987283e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.8344470262527466, + "num_tokens": 24858743.0, + "step": 655 + }, + { + "epoch": 0.08344994275537464, + "ewc_loss": 0.004364013671875, + "ewc_loss_parallel": 4.351139068603516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.89405632019043, + "learning_rate": 2.7766002543450614e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8260343670845032, + "num_tokens": 24901286.0, + "step": 656 + }, + { + "epoch": 0.08357715303396514, + "ewc_loss": 0.004364013671875, + "ewc_loss_parallel": 4.351139068603516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.824335098266602, + "learning_rate": 2.78083933870284e-07, + "loss": 0.6189, + "mean_token_accuracy": 0.8089404702186584, + "num_tokens": 24943457.0, + "step": 657 + }, + { + "epoch": 0.08370436331255565, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.818344116210938, + "learning_rate": 2.785078423060619e-07, + "loss": 0.5575, + "mean_token_accuracy": 0.8234314918518066, + "num_tokens": 24979247.0, + "step": 658 + }, + { + "epoch": 0.08383157359114617, + "ewc_loss": 0.004364013671875, + "ewc_loss_parallel": 4.351139068603516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.946064949035645, + "learning_rate": 2.789317507418398e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.8199851512908936, + "num_tokens": 25017456.0, + "step": 659 + }, + { + "epoch": 0.08395878386973668, + "ewc_loss": 0.00439453125, + "ewc_loss_parallel": 4.380941390991211e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.740262031555176, + "learning_rate": 2.7935565917761763e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.8313121795654297, + "num_tokens": 25054682.0, + "step": 660 + }, + { + "epoch": 0.08408599414832718, + "ewc_loss": 0.00439453125, + "ewc_loss_parallel": 4.380941390991211e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.967916488647461, + "learning_rate": 2.797795676133955e-07, + "loss": 0.6, + "mean_token_accuracy": 0.8114233016967773, + "num_tokens": 25092501.0, + "step": 661 + }, + { + "epoch": 0.0842132044269177, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.886192321777344, + "learning_rate": 2.802034760491734e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8510423898696899, + "num_tokens": 25129253.0, + "step": 662 + }, + { + "epoch": 0.08434041470550821, + "ewc_loss": 0.00439453125, + "ewc_loss_parallel": 4.380941390991211e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.710569381713867, + "learning_rate": 2.806273844849512e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.8413634300231934, + "num_tokens": 25169119.0, + "step": 663 + }, + { + "epoch": 0.08446762498409871, + "ewc_loss": 0.00439453125, + "ewc_loss_parallel": 4.380941390991211e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.978926658630371, + "learning_rate": 2.810512929207291e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.8134267926216125, + "num_tokens": 25207654.0, + "step": 664 + }, + { + "epoch": 0.08459483526268922, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.862875938415527, + "learning_rate": 2.8147520135650697e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8317051529884338, + "num_tokens": 25243258.0, + "step": 665 + }, + { + "epoch": 0.08472204554127974, + "ewc_loss": 0.00439453125, + "ewc_loss_parallel": 4.380941390991211e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.844710350036621, + "learning_rate": 2.8189910979228487e-07, + "loss": 0.5618, + "mean_token_accuracy": 0.8258183002471924, + "num_tokens": 25281301.0, + "step": 666 + }, + { + "epoch": 0.08484925581987024, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.854604721069336, + "learning_rate": 2.823230182280627e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.838157057762146, + "num_tokens": 25313500.0, + "step": 667 + }, + { + "epoch": 0.08497646609846075, + "ewc_loss": 0.00439453125, + "ewc_loss_parallel": 4.380941390991211e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.797249794006348, + "learning_rate": 2.827469266638406e-07, + "loss": 0.5502, + "mean_token_accuracy": 0.8264296054840088, + "num_tokens": 25355121.0, + "step": 668 + }, + { + "epoch": 0.08510367637705127, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.821249008178711, + "learning_rate": 2.8317083509961846e-07, + "loss": 0.5656, + "mean_token_accuracy": 0.8245188593864441, + "num_tokens": 25398868.0, + "step": 669 + }, + { + "epoch": 0.08523088665564178, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.838335037231445, + "learning_rate": 2.8359474353539636e-07, + "loss": 0.5441, + "mean_token_accuracy": 0.8277704119682312, + "num_tokens": 25438670.0, + "step": 670 + }, + { + "epoch": 0.08535809693423228, + "ewc_loss": 0.00445556640625, + "ewc_loss_parallel": 4.4405460357666016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.94074535369873, + "learning_rate": 2.840186519711742e-07, + "loss": 0.5913, + "mean_token_accuracy": 0.8193872570991516, + "num_tokens": 25479243.0, + "step": 671 + }, + { + "epoch": 0.0854853072128228, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.90312671661377, + "learning_rate": 2.844425604069521e-07, + "loss": 0.5744, + "mean_token_accuracy": 0.8185998797416687, + "num_tokens": 25514615.0, + "step": 672 + }, + { + "epoch": 0.08561251749141331, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.091930389404297, + "learning_rate": 2.8486646884272995e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.8255798816680908, + "num_tokens": 25552753.0, + "step": 673 + }, + { + "epoch": 0.08573972777000381, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.823478698730469, + "learning_rate": 2.8529037727850785e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8438563942909241, + "num_tokens": 25587415.0, + "step": 674 + }, + { + "epoch": 0.08586693804859433, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.871031761169434, + "learning_rate": 2.857142857142857e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8376665711402893, + "num_tokens": 25625962.0, + "step": 675 + }, + { + "epoch": 0.08599414832718484, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.984257698059082, + "learning_rate": 2.861381941500636e-07, + "loss": 0.5685, + "mean_token_accuracy": 0.8210362792015076, + "num_tokens": 25669680.0, + "step": 676 + }, + { + "epoch": 0.08612135860577534, + "ewc_loss": 0.00445556640625, + "ewc_loss_parallel": 4.4405460357666016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.950937271118164, + "learning_rate": 2.8656210258584144e-07, + "loss": 0.6045, + "mean_token_accuracy": 0.8081158399581909, + "num_tokens": 25709221.0, + "step": 677 + }, + { + "epoch": 0.08624856888436586, + "ewc_loss": 0.00445556640625, + "ewc_loss_parallel": 4.4405460357666016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.187766075134277, + "learning_rate": 2.869860110216193e-07, + "loss": 0.6044, + "mean_token_accuracy": 0.8083016872406006, + "num_tokens": 25741880.0, + "step": 678 + }, + { + "epoch": 0.08637577916295637, + "ewc_loss": 0.0045166015625, + "ewc_loss_parallel": 4.5299530029296875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.973273277282715, + "learning_rate": 2.874099194573972e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8301615118980408, + "num_tokens": 25786282.0, + "step": 679 + }, + { + "epoch": 0.08650298944154687, + "ewc_loss": 0.00445556640625, + "ewc_loss_parallel": 4.4405460357666016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.892849922180176, + "learning_rate": 2.878338278931751e-07, + "loss": 0.5834, + "mean_token_accuracy": 0.8187533617019653, + "num_tokens": 25820229.0, + "step": 680 + }, + { + "epoch": 0.08663019972013739, + "ewc_loss": 0.004425048828125, + "ewc_loss_parallel": 4.410743713378906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.917281150817871, + "learning_rate": 2.8825773632895293e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8397639989852905, + "num_tokens": 25856465.0, + "step": 681 + }, + { + "epoch": 0.0867574099987279, + "ewc_loss": 0.00445556640625, + "ewc_loss_parallel": 4.470348358154297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.881728172302246, + "learning_rate": 2.886816447647308e-07, + "loss": 0.5168, + "mean_token_accuracy": 0.836729884147644, + "num_tokens": 25897407.0, + "step": 682 + }, + { + "epoch": 0.08688462027731841, + "ewc_loss": 0.004486083984375, + "ewc_loss_parallel": 4.500150680541992e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.940733909606934, + "learning_rate": 2.891055532005087e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.831034243106842, + "num_tokens": 25938825.0, + "step": 683 + }, + { + "epoch": 0.08701183055590891, + "ewc_loss": 0.004486083984375, + "ewc_loss_parallel": 4.500150680541992e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.068269729614258, + "learning_rate": 2.8952946163628657e-07, + "loss": 0.5716, + "mean_token_accuracy": 0.8172658681869507, + "num_tokens": 25974944.0, + "step": 684 + }, + { + "epoch": 0.08713904083449943, + "ewc_loss": 0.00445556640625, + "ewc_loss_parallel": 4.470348358154297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.963165283203125, + "learning_rate": 2.899533700720644e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.8309135437011719, + "num_tokens": 26012864.0, + "step": 685 + }, + { + "epoch": 0.08726625111308994, + "ewc_loss": 0.00457763671875, + "ewc_loss_parallel": 4.589557647705078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.965986251831055, + "learning_rate": 2.9037727850784227e-07, + "loss": 0.5757, + "mean_token_accuracy": 0.8199269771575928, + "num_tokens": 26049779.0, + "step": 686 + }, + { + "epoch": 0.08739346139168044, + "ewc_loss": 0.004547119140625, + "ewc_loss_parallel": 4.559755325317383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.971827507019043, + "learning_rate": 2.9080118694362016e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8297500610351562, + "num_tokens": 26089095.0, + "step": 687 + }, + { + "epoch": 0.08752067167027096, + "ewc_loss": 0.004486083984375, + "ewc_loss_parallel": 4.500150680541992e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.817803382873535, + "learning_rate": 2.9122509537939806e-07, + "loss": 0.545, + "mean_token_accuracy": 0.8288666605949402, + "num_tokens": 26132553.0, + "step": 688 + }, + { + "epoch": 0.08764788194886147, + "ewc_loss": 0.004547119140625, + "ewc_loss_parallel": 4.559755325317383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.058137893676758, + "learning_rate": 2.916490038151759e-07, + "loss": 0.5954, + "mean_token_accuracy": 0.8111950159072876, + "num_tokens": 26171301.0, + "step": 689 + }, + { + "epoch": 0.08777509222745197, + "ewc_loss": 0.00457763671875, + "ewc_loss_parallel": 4.589557647705078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.161643028259277, + "learning_rate": 2.9207291225095376e-07, + "loss": 0.5832, + "mean_token_accuracy": 0.8156188130378723, + "num_tokens": 26217871.0, + "step": 690 + }, + { + "epoch": 0.08790230250604249, + "ewc_loss": 0.00457763671875, + "ewc_loss_parallel": 4.589557647705078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.826421737670898, + "learning_rate": 2.9249682068673166e-07, + "loss": 0.5464, + "mean_token_accuracy": 0.8264206647872925, + "num_tokens": 26255927.0, + "step": 691 + }, + { + "epoch": 0.088029512784633, + "ewc_loss": 0.0045166015625, + "ewc_loss_parallel": 4.5299530029296875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.071673393249512, + "learning_rate": 2.9292072912250955e-07, + "loss": 0.5297, + "mean_token_accuracy": 0.8350098133087158, + "num_tokens": 26296041.0, + "step": 692 + }, + { + "epoch": 0.0881567230632235, + "ewc_loss": 0.004608154296875, + "ewc_loss_parallel": 4.6193599700927734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.141831398010254, + "learning_rate": 2.933446375582874e-07, + "loss": 0.6126, + "mean_token_accuracy": 0.8095011115074158, + "num_tokens": 26333785.0, + "step": 693 + }, + { + "epoch": 0.08828393334181402, + "ewc_loss": 0.004547119140625, + "ewc_loss_parallel": 4.559755325317383e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.971796989440918, + "learning_rate": 2.9376854599406525e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8338373899459839, + "num_tokens": 26368837.0, + "step": 694 + }, + { + "epoch": 0.08841114362040453, + "ewc_loss": 0.0045166015625, + "ewc_loss_parallel": 4.5299530029296875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.25498104095459, + "learning_rate": 2.9419245442984315e-07, + "loss": 0.594, + "mean_token_accuracy": 0.8159310817718506, + "num_tokens": 26402151.0, + "step": 695 + }, + { + "epoch": 0.08853835389899505, + "ewc_loss": 0.004608154296875, + "ewc_loss_parallel": 4.6193599700927734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.05415153503418, + "learning_rate": 2.9461636286562104e-07, + "loss": 0.5452, + "mean_token_accuracy": 0.8274306058883667, + "num_tokens": 26435550.0, + "step": 696 + }, + { + "epoch": 0.08866556417758555, + "ewc_loss": 0.00457763671875, + "ewc_loss_parallel": 4.589557647705078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.951804161071777, + "learning_rate": 2.9504027130139884e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8312236070632935, + "num_tokens": 26479299.0, + "step": 697 + }, + { + "epoch": 0.08879277445617606, + "ewc_loss": 0.00457763671875, + "ewc_loss_parallel": 4.589557647705078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.080591201782227, + "learning_rate": 2.9546417973717674e-07, + "loss": 0.5719, + "mean_token_accuracy": 0.8199605941772461, + "num_tokens": 26519380.0, + "step": 698 + }, + { + "epoch": 0.08891998473476657, + "ewc_loss": 0.00457763671875, + "ewc_loss_parallel": 4.589557647705078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.038457870483398, + "learning_rate": 2.9588808817295464e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.840665340423584, + "num_tokens": 26557306.0, + "step": 699 + }, + { + "epoch": 0.08904719501335707, + "ewc_loss": 0.004638671875, + "ewc_loss_parallel": 4.649162292480469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.13008975982666, + "learning_rate": 2.9631199660873253e-07, + "loss": 0.5688, + "mean_token_accuracy": 0.8215720057487488, + "num_tokens": 26593549.0, + "step": 700 + }, + { + "epoch": 0.08917440529194759, + "ewc_loss": 0.004730224609375, + "ewc_loss_parallel": 4.738569259643555e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.315045356750488, + "learning_rate": 2.9673590504451033e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8449761271476746, + "num_tokens": 26629926.0, + "step": 701 + }, + { + "epoch": 0.0893016155705381, + "ewc_loss": 0.004669189453125, + "ewc_loss_parallel": 4.678964614868164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.033042907714844, + "learning_rate": 2.9715981348028823e-07, + "loss": 0.5257, + "mean_token_accuracy": 0.8355300426483154, + "num_tokens": 26668652.0, + "step": 702 + }, + { + "epoch": 0.0894288258491286, + "ewc_loss": 0.004638671875, + "ewc_loss_parallel": 4.649162292480469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.03856086730957, + "learning_rate": 2.975837219160661e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.8154009580612183, + "num_tokens": 26708084.0, + "step": 703 + }, + { + "epoch": 0.08955603612771912, + "ewc_loss": 0.004669189453125, + "ewc_loss_parallel": 4.678964614868164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.103043556213379, + "learning_rate": 2.98007630351844e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8269299268722534, + "num_tokens": 26750134.0, + "step": 704 + }, + { + "epoch": 0.08968324640630963, + "ewc_loss": 0.004730224609375, + "ewc_loss_parallel": 4.738569259643555e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.333666801452637, + "learning_rate": 2.984315387876218e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.8223140239715576, + "num_tokens": 26785410.0, + "step": 705 + }, + { + "epoch": 0.08981045668490013, + "ewc_loss": 0.004730224609375, + "ewc_loss_parallel": 4.738569259643555e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 9.966373443603516, + "learning_rate": 2.988554472233997e-07, + "loss": 0.5652, + "mean_token_accuracy": 0.825198769569397, + "num_tokens": 26825226.0, + "step": 706 + }, + { + "epoch": 0.08993766696349065, + "ewc_loss": 0.004669189453125, + "ewc_loss_parallel": 4.678964614868164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.09920597076416, + "learning_rate": 2.992793556591776e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.8237505555152893, + "num_tokens": 26865401.0, + "step": 707 + }, + { + "epoch": 0.09006487724208116, + "ewc_loss": 0.00482177734375, + "ewc_loss_parallel": 4.827976226806641e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.215412139892578, + "learning_rate": 2.997032640949555e-07, + "loss": 0.5692, + "mean_token_accuracy": 0.8219394683837891, + "num_tokens": 26905365.0, + "step": 708 + }, + { + "epoch": 0.09019208752067168, + "ewc_loss": 0.0047607421875, + "ewc_loss_parallel": 4.76837158203125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.056781768798828, + "learning_rate": 3.001271725307333e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8388311862945557, + "num_tokens": 26941348.0, + "step": 709 + }, + { + "epoch": 0.09031929779926218, + "ewc_loss": 0.0047607421875, + "ewc_loss_parallel": 4.76837158203125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.152122497558594, + "learning_rate": 3.005510809665112e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.8291239738464355, + "num_tokens": 26975803.0, + "step": 710 + }, + { + "epoch": 0.09044650807785269, + "ewc_loss": 0.004852294921875, + "ewc_loss_parallel": 4.857778549194336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.257559776306152, + "learning_rate": 3.009749894022891e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8439150452613831, + "num_tokens": 27014931.0, + "step": 711 + }, + { + "epoch": 0.0905737183564432, + "ewc_loss": 0.004791259765625, + "ewc_loss_parallel": 4.798173904418945e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.161541938781738, + "learning_rate": 3.01398897838067e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.8293362855911255, + "num_tokens": 27055505.0, + "step": 712 + }, + { + "epoch": 0.0907009286350337, + "ewc_loss": 0.004791259765625, + "ewc_loss_parallel": 4.798173904418945e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.240887641906738, + "learning_rate": 3.018228062738448e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.8222355246543884, + "num_tokens": 27092121.0, + "step": 713 + }, + { + "epoch": 0.09082813891362422, + "ewc_loss": 0.004791259765625, + "ewc_loss_parallel": 4.798173904418945e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.281388282775879, + "learning_rate": 3.022467147096227e-07, + "loss": 0.5121, + "mean_token_accuracy": 0.8376898765563965, + "num_tokens": 27129827.0, + "step": 714 + }, + { + "epoch": 0.09095534919221474, + "ewc_loss": 0.00482177734375, + "ewc_loss_parallel": 4.827976226806641e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.121386528015137, + "learning_rate": 3.026706231454006e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8324013352394104, + "num_tokens": 27174977.0, + "step": 715 + }, + { + "epoch": 0.09108255947080524, + "ewc_loss": 0.004791259765625, + "ewc_loss_parallel": 4.798173904418945e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.176545143127441, + "learning_rate": 3.0309453158117844e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8304606080055237, + "num_tokens": 27211472.0, + "step": 716 + }, + { + "epoch": 0.09120976974939575, + "ewc_loss": 0.004852294921875, + "ewc_loss_parallel": 4.857778549194336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.275398254394531, + "learning_rate": 3.035184400169563e-07, + "loss": 0.6187, + "mean_token_accuracy": 0.810152530670166, + "num_tokens": 27249160.0, + "step": 717 + }, + { + "epoch": 0.09133698002798626, + "ewc_loss": 0.00482177734375, + "ewc_loss_parallel": 4.827976226806641e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.191300392150879, + "learning_rate": 3.039423484527342e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8324259519577026, + "num_tokens": 27287946.0, + "step": 718 + }, + { + "epoch": 0.09146419030657676, + "ewc_loss": 0.00482177734375, + "ewc_loss_parallel": 4.827976226806641e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.137691497802734, + "learning_rate": 3.043662568885121e-07, + "loss": 0.5607, + "mean_token_accuracy": 0.8188413977622986, + "num_tokens": 27323388.0, + "step": 719 + }, + { + "epoch": 0.09159140058516728, + "ewc_loss": 0.004852294921875, + "ewc_loss_parallel": 4.857778549194336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.294164657592773, + "learning_rate": 3.0479016532428993e-07, + "loss": 0.6077, + "mean_token_accuracy": 0.8086369633674622, + "num_tokens": 27357524.0, + "step": 720 + }, + { + "epoch": 0.0917186108637578, + "ewc_loss": 0.004913330078125, + "ewc_loss_parallel": 4.9173831939697266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.076929092407227, + "learning_rate": 3.052140737600678e-07, + "loss": 0.5141, + "mean_token_accuracy": 0.836186408996582, + "num_tokens": 27392037.0, + "step": 721 + }, + { + "epoch": 0.09184582114234831, + "ewc_loss": 0.004852294921875, + "ewc_loss_parallel": 4.857778549194336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.059981346130371, + "learning_rate": 3.056379821958457e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8337371349334717, + "num_tokens": 27429149.0, + "step": 722 + }, + { + "epoch": 0.09197303142093881, + "ewc_loss": 0.0048828125, + "ewc_loss_parallel": 4.887580871582031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.311995506286621, + "learning_rate": 3.060618906316236e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8347748517990112, + "num_tokens": 27465023.0, + "step": 723 + }, + { + "epoch": 0.09210024169952932, + "ewc_loss": 0.0048828125, + "ewc_loss_parallel": 4.887580871582031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.232934951782227, + "learning_rate": 3.064857990674014e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8352406024932861, + "num_tokens": 27501268.0, + "step": 724 + }, + { + "epoch": 0.09222745197811984, + "ewc_loss": 0.004852294921875, + "ewc_loss_parallel": 4.857778549194336e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.271678924560547, + "learning_rate": 3.0690970750317927e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8197230100631714, + "num_tokens": 27539937.0, + "step": 725 + }, + { + "epoch": 0.09235466225671034, + "ewc_loss": 0.004913330078125, + "ewc_loss_parallel": 4.9173831939697266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.165976524353027, + "learning_rate": 3.0733361593895717e-07, + "loss": 0.5369, + "mean_token_accuracy": 0.8310585618019104, + "num_tokens": 27574576.0, + "step": 726 + }, + { + "epoch": 0.09248187253530085, + "ewc_loss": 0.0048828125, + "ewc_loss_parallel": 4.887580871582031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.275314331054688, + "learning_rate": 3.0775752437473507e-07, + "loss": 0.546, + "mean_token_accuracy": 0.8241546154022217, + "num_tokens": 27616036.0, + "step": 727 + }, + { + "epoch": 0.09260908281389137, + "ewc_loss": 0.004913330078125, + "ewc_loss_parallel": 4.9173831939697266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.100507736206055, + "learning_rate": 3.081814328105129e-07, + "loss": 0.5698, + "mean_token_accuracy": 0.8209418058395386, + "num_tokens": 27662040.0, + "step": 728 + }, + { + "epoch": 0.09273629309248187, + "ewc_loss": 0.0048828125, + "ewc_loss_parallel": 4.887580871582031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.186969757080078, + "learning_rate": 3.0860534124629076e-07, + "loss": 0.5986, + "mean_token_accuracy": 0.8125925660133362, + "num_tokens": 27705136.0, + "step": 729 + }, + { + "epoch": 0.09286350337107238, + "ewc_loss": 0.0048828125, + "ewc_loss_parallel": 4.887580871582031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.249369621276855, + "learning_rate": 3.0902924968206866e-07, + "loss": 0.5488, + "mean_token_accuracy": 0.8263950347900391, + "num_tokens": 27740564.0, + "step": 730 + }, + { + "epoch": 0.0929907136496629, + "ewc_loss": 0.00494384765625, + "ewc_loss_parallel": 4.947185516357422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.38060474395752, + "learning_rate": 3.0945315811784656e-07, + "loss": 0.5208, + "mean_token_accuracy": 0.8331599235534668, + "num_tokens": 27774788.0, + "step": 731 + }, + { + "epoch": 0.0931179239282534, + "ewc_loss": 0.0050048828125, + "ewc_loss_parallel": 5.0067901611328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.147241592407227, + "learning_rate": 3.098770665536244e-07, + "loss": 0.6665, + "mean_token_accuracy": 0.7902251482009888, + "num_tokens": 27818300.0, + "step": 732 + }, + { + "epoch": 0.09324513420684391, + "ewc_loss": 0.004913330078125, + "ewc_loss_parallel": 4.9173831939697266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.140534400939941, + "learning_rate": 3.1030097498940225e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.838313102722168, + "num_tokens": 27857739.0, + "step": 733 + }, + { + "epoch": 0.09337234448543442, + "ewc_loss": 0.0050048828125, + "ewc_loss_parallel": 5.0067901611328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.24885082244873, + "learning_rate": 3.1072488342518015e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8219105005264282, + "num_tokens": 27893897.0, + "step": 734 + }, + { + "epoch": 0.09349955476402494, + "ewc_loss": 0.004974365234375, + "ewc_loss_parallel": 4.976987838745117e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.15179443359375, + "learning_rate": 3.11148791860958e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8284677863121033, + "num_tokens": 27930511.0, + "step": 735 + }, + { + "epoch": 0.09362676504261544, + "ewc_loss": 0.00494384765625, + "ewc_loss_parallel": 4.947185516357422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.358683586120605, + "learning_rate": 3.115727002967359e-07, + "loss": 0.5529, + "mean_token_accuracy": 0.8243306875228882, + "num_tokens": 27961128.0, + "step": 736 + }, + { + "epoch": 0.09375397532120595, + "ewc_loss": 0.004974365234375, + "ewc_loss_parallel": 4.976987838745117e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.252437591552734, + "learning_rate": 3.1199660873251374e-07, + "loss": 0.5532, + "mean_token_accuracy": 0.826819121837616, + "num_tokens": 28000748.0, + "step": 737 + }, + { + "epoch": 0.09388118559979647, + "ewc_loss": 0.004913330078125, + "ewc_loss_parallel": 4.9173831939697266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.222875595092773, + "learning_rate": 3.1242051716829164e-07, + "loss": 0.5859, + "mean_token_accuracy": 0.8138946294784546, + "num_tokens": 28046455.0, + "step": 738 + }, + { + "epoch": 0.09400839587838697, + "ewc_loss": 0.0050048828125, + "ewc_loss_parallel": 5.0067901611328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.188323020935059, + "learning_rate": 3.128444256040695e-07, + "loss": 0.5625, + "mean_token_accuracy": 0.824015736579895, + "num_tokens": 28087026.0, + "step": 739 + }, + { + "epoch": 0.09413560615697748, + "ewc_loss": 0.0050048828125, + "ewc_loss_parallel": 5.0067901611328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.227596282958984, + "learning_rate": 3.132683340398474e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.8436734080314636, + "num_tokens": 28126382.0, + "step": 740 + }, + { + "epoch": 0.094262816435568, + "ewc_loss": 0.005035400390625, + "ewc_loss_parallel": 5.036592483520508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.280950546264648, + "learning_rate": 3.1369224247562523e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8385052680969238, + "num_tokens": 28161198.0, + "step": 741 + }, + { + "epoch": 0.0943900267141585, + "ewc_loss": 0.0050048828125, + "ewc_loss_parallel": 5.0067901611328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.161632537841797, + "learning_rate": 3.1411615091140313e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8469647169113159, + "num_tokens": 28204612.0, + "step": 742 + }, + { + "epoch": 0.09451723699274901, + "ewc_loss": 0.00506591796875, + "ewc_loss_parallel": 5.066394805908203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.339731216430664, + "learning_rate": 3.14540059347181e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8301135897636414, + "num_tokens": 28241050.0, + "step": 743 + }, + { + "epoch": 0.09464444727133953, + "ewc_loss": 0.005035400390625, + "ewc_loss_parallel": 5.036592483520508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.234618186950684, + "learning_rate": 3.149639677829589e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8404196500778198, + "num_tokens": 28278032.0, + "step": 744 + }, + { + "epoch": 0.09477165754993004, + "ewc_loss": 0.0050048828125, + "ewc_loss_parallel": 5.0067901611328125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.282438278198242, + "learning_rate": 3.153878762187368e-07, + "loss": 0.5815, + "mean_token_accuracy": 0.8214890360832214, + "num_tokens": 28315599.0, + "step": 745 + }, + { + "epoch": 0.09489886782852054, + "ewc_loss": 0.005096435546875, + "ewc_loss_parallel": 5.0961971282958984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.306485176086426, + "learning_rate": 3.158117846545146e-07, + "loss": 0.6056, + "mean_token_accuracy": 0.812767744064331, + "num_tokens": 28358504.0, + "step": 746 + }, + { + "epoch": 0.09502607810711106, + "ewc_loss": 0.005035400390625, + "ewc_loss_parallel": 5.036592483520508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.336139678955078, + "learning_rate": 3.1623569309029247e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8342511653900146, + "num_tokens": 28389626.0, + "step": 747 + }, + { + "epoch": 0.09515328838570157, + "ewc_loss": 0.00506591796875, + "ewc_loss_parallel": 5.066394805908203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.17105770111084, + "learning_rate": 3.1665960152607037e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.838857889175415, + "num_tokens": 28430361.0, + "step": 748 + }, + { + "epoch": 0.09528049866429207, + "ewc_loss": 0.00506591796875, + "ewc_loss_parallel": 5.066394805908203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.311990737915039, + "learning_rate": 3.1708350996184826e-07, + "loss": 0.5415, + "mean_token_accuracy": 0.8280858993530273, + "num_tokens": 28469792.0, + "step": 749 + }, + { + "epoch": 0.09540770894288259, + "ewc_loss": 0.005096435546875, + "ewc_loss_parallel": 5.0961971282958984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.45001220703125, + "learning_rate": 3.175074183976261e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8375961780548096, + "num_tokens": 28507569.0, + "step": 750 + }, + { + "epoch": 0.0955349192214731, + "ewc_loss": 0.005035400390625, + "ewc_loss_parallel": 5.036592483520508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.284996032714844, + "learning_rate": 3.1793132683340396e-07, + "loss": 0.5469, + "mean_token_accuracy": 0.8279987573623657, + "num_tokens": 28542336.0, + "step": 751 + }, + { + "epoch": 0.0956621295000636, + "ewc_loss": 0.005035400390625, + "ewc_loss_parallel": 5.036592483520508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.233894348144531, + "learning_rate": 3.1835523526918186e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8419511318206787, + "num_tokens": 28578587.0, + "step": 752 + }, + { + "epoch": 0.09578933977865411, + "ewc_loss": 0.00506591796875, + "ewc_loss_parallel": 5.066394805908203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.45277214050293, + "learning_rate": 3.1877914370495975e-07, + "loss": 0.577, + "mean_token_accuracy": 0.8185834288597107, + "num_tokens": 28617505.0, + "step": 753 + }, + { + "epoch": 0.09591655005724463, + "ewc_loss": 0.005035400390625, + "ewc_loss_parallel": 5.036592483520508e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.370438575744629, + "learning_rate": 3.1920305214073755e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.8183311820030212, + "num_tokens": 28658526.0, + "step": 754 + }, + { + "epoch": 0.09604376033583513, + "ewc_loss": 0.005096435546875, + "ewc_loss_parallel": 5.0961971282958984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.279926300048828, + "learning_rate": 3.1962696057651545e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8351671099662781, + "num_tokens": 28698825.0, + "step": 755 + }, + { + "epoch": 0.09617097061442564, + "ewc_loss": 0.00506591796875, + "ewc_loss_parallel": 5.066394805908203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.426280975341797, + "learning_rate": 3.2005086901229335e-07, + "loss": 0.5607, + "mean_token_accuracy": 0.8208291530609131, + "num_tokens": 28733238.0, + "step": 756 + }, + { + "epoch": 0.09629818089301616, + "ewc_loss": 0.005096435546875, + "ewc_loss_parallel": 5.0961971282958984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.424293518066406, + "learning_rate": 3.2047477744807125e-07, + "loss": 0.5627, + "mean_token_accuracy": 0.8257201910018921, + "num_tokens": 28771552.0, + "step": 757 + }, + { + "epoch": 0.09642539117160667, + "ewc_loss": 0.005096435546875, + "ewc_loss_parallel": 5.0961971282958984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.196877479553223, + "learning_rate": 3.2089868588384904e-07, + "loss": 0.5949, + "mean_token_accuracy": 0.8109548091888428, + "num_tokens": 28810827.0, + "step": 758 + }, + { + "epoch": 0.09655260145019717, + "ewc_loss": 0.00506591796875, + "ewc_loss_parallel": 5.066394805908203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.436430931091309, + "learning_rate": 3.2132259431962694e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8403723835945129, + "num_tokens": 28849017.0, + "step": 759 + }, + { + "epoch": 0.09667981172878769, + "ewc_loss": 0.005096435546875, + "ewc_loss_parallel": 5.0961971282958984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.255597114562988, + "learning_rate": 3.2174650275540484e-07, + "loss": 0.555, + "mean_token_accuracy": 0.8300420045852661, + "num_tokens": 28887659.0, + "step": 760 + }, + { + "epoch": 0.0968070220073782, + "ewc_loss": 0.005096435546875, + "ewc_loss_parallel": 5.0961971282958984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.500232696533203, + "learning_rate": 3.2217041119118274e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8336538076400757, + "num_tokens": 28928983.0, + "step": 761 + }, + { + "epoch": 0.0969342322859687, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.3787202835083, + "learning_rate": 3.2259431962696053e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.8258554339408875, + "num_tokens": 28962195.0, + "step": 762 + }, + { + "epoch": 0.09706144256455922, + "ewc_loss": 0.005157470703125, + "ewc_loss_parallel": 5.155801773071289e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.518288612365723, + "learning_rate": 3.2301822806273843e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8277430534362793, + "num_tokens": 28997351.0, + "step": 763 + }, + { + "epoch": 0.09718865284314973, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.452139854431152, + "learning_rate": 3.2344213649851633e-07, + "loss": 0.5656, + "mean_token_accuracy": 0.8230729103088379, + "num_tokens": 29040988.0, + "step": 764 + }, + { + "epoch": 0.09731586312174023, + "ewc_loss": 0.005126953125, + "ewc_loss_parallel": 5.125999450683594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.24731731414795, + "learning_rate": 3.238660449342942e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.8315876722335815, + "num_tokens": 29081873.0, + "step": 765 + }, + { + "epoch": 0.09744307340033075, + "ewc_loss": 0.005126953125, + "ewc_loss_parallel": 5.125999450683594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.412464141845703, + "learning_rate": 3.24289953370072e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.838611900806427, + "num_tokens": 29115360.0, + "step": 766 + }, + { + "epoch": 0.09757028367892126, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.50658130645752, + "learning_rate": 3.247138618058499e-07, + "loss": 0.5432, + "mean_token_accuracy": 0.8245834708213806, + "num_tokens": 29160139.0, + "step": 767 + }, + { + "epoch": 0.09769749395751176, + "ewc_loss": 0.005157470703125, + "ewc_loss_parallel": 5.155801773071289e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.306544303894043, + "learning_rate": 3.251377702416278e-07, + "loss": 0.572, + "mean_token_accuracy": 0.8213915228843689, + "num_tokens": 29202872.0, + "step": 768 + }, + { + "epoch": 0.09782470423610228, + "ewc_loss": 0.005126953125, + "ewc_loss_parallel": 5.125999450683594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.392704010009766, + "learning_rate": 3.255616786774057e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8352165222167969, + "num_tokens": 29246965.0, + "step": 769 + }, + { + "epoch": 0.09795191451469279, + "ewc_loss": 0.005157470703125, + "ewc_loss_parallel": 5.155801773071289e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.50544548034668, + "learning_rate": 3.259855871131835e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8531385660171509, + "num_tokens": 29289531.0, + "step": 770 + }, + { + "epoch": 0.0980791247932833, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.3516206741333, + "learning_rate": 3.264094955489614e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8359605073928833, + "num_tokens": 29330484.0, + "step": 771 + }, + { + "epoch": 0.0982063350718738, + "ewc_loss": 0.005157470703125, + "ewc_loss_parallel": 5.155801773071289e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.413040161132812, + "learning_rate": 3.268334039847393e-07, + "loss": 0.5434, + "mean_token_accuracy": 0.8264510631561279, + "num_tokens": 29369595.0, + "step": 772 + }, + { + "epoch": 0.09833354535046432, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.471060752868652, + "learning_rate": 3.2725731242051715e-07, + "loss": 0.5315, + "mean_token_accuracy": 0.8339260816574097, + "num_tokens": 29411856.0, + "step": 773 + }, + { + "epoch": 0.09846075562905483, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.447591781616211, + "learning_rate": 3.27681220856295e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8386125564575195, + "num_tokens": 29454754.0, + "step": 774 + }, + { + "epoch": 0.09858796590764533, + "ewc_loss": 0.005157470703125, + "ewc_loss_parallel": 5.155801773071289e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.478875160217285, + "learning_rate": 3.281051292920729e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.835228681564331, + "num_tokens": 29489131.0, + "step": 775 + }, + { + "epoch": 0.09871517618623585, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.400411605834961, + "learning_rate": 3.285290377278508e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8194228410720825, + "num_tokens": 29528292.0, + "step": 776 + }, + { + "epoch": 0.09884238646482636, + "ewc_loss": 0.005157470703125, + "ewc_loss_parallel": 5.155801773071289e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.620634078979492, + "learning_rate": 3.2895294616362864e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8428031206130981, + "num_tokens": 29564878.0, + "step": 777 + }, + { + "epoch": 0.09896959674341686, + "ewc_loss": 0.0052490234375, + "ewc_loss_parallel": 5.245208740234375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.516595840454102, + "learning_rate": 3.293768545994065e-07, + "loss": 0.5168, + "mean_token_accuracy": 0.8324657678604126, + "num_tokens": 29602826.0, + "step": 778 + }, + { + "epoch": 0.09909680702200738, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.457784652709961, + "learning_rate": 3.298007630351844e-07, + "loss": 0.5846, + "mean_token_accuracy": 0.8182271718978882, + "num_tokens": 29640323.0, + "step": 779 + }, + { + "epoch": 0.09922401730059789, + "ewc_loss": 0.00518798828125, + "ewc_loss_parallel": 5.185604095458984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.428750038146973, + "learning_rate": 3.302246714709623e-07, + "loss": 0.5142, + "mean_token_accuracy": 0.8353773355484009, + "num_tokens": 29683302.0, + "step": 780 + }, + { + "epoch": 0.09935122757918839, + "ewc_loss": 0.005218505859375, + "ewc_loss_parallel": 5.21540641784668e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.481392860412598, + "learning_rate": 3.3064857990674013e-07, + "loss": 0.54, + "mean_token_accuracy": 0.8313062191009521, + "num_tokens": 29721085.0, + "step": 781 + }, + { + "epoch": 0.0994784378577789, + "ewc_loss": 0.005218505859375, + "ewc_loss_parallel": 5.21540641784668e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.623220443725586, + "learning_rate": 3.31072488342518e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8424354791641235, + "num_tokens": 29753810.0, + "step": 782 + }, + { + "epoch": 0.09960564813636942, + "ewc_loss": 0.0052490234375, + "ewc_loss_parallel": 5.245208740234375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.475132942199707, + "learning_rate": 3.314963967782959e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8350633382797241, + "num_tokens": 29796488.0, + "step": 783 + }, + { + "epoch": 0.09973285841495994, + "ewc_loss": 0.005279541015625, + "ewc_loss_parallel": 5.27501106262207e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.532231330871582, + "learning_rate": 3.319203052140738e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.8320451378822327, + "num_tokens": 29838181.0, + "step": 784 + }, + { + "epoch": 0.09986006869355044, + "ewc_loss": 0.0052490234375, + "ewc_loss_parallel": 5.245208740234375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.72482681274414, + "learning_rate": 3.323442136498516e-07, + "loss": 0.5103, + "mean_token_accuracy": 0.8378092646598816, + "num_tokens": 29868893.0, + "step": 785 + }, + { + "epoch": 0.09998727897214095, + "ewc_loss": 0.005279541015625, + "ewc_loss_parallel": 5.27501106262207e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.577508926391602, + "learning_rate": 3.3276812208562947e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8315285444259644, + "num_tokens": 29903571.0, + "step": 786 + }, + { + "epoch": 0.10011448925073146, + "ewc_loss": 0.0052490234375, + "ewc_loss_parallel": 5.245208740234375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.505707740783691, + "learning_rate": 3.3319203052140737e-07, + "loss": 0.5844, + "mean_token_accuracy": 0.8225685358047485, + "num_tokens": 29940438.0, + "step": 787 + }, + { + "epoch": 0.10024169952932196, + "ewc_loss": 0.0052490234375, + "ewc_loss_parallel": 5.245208740234375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.581945419311523, + "learning_rate": 3.336159389571852e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8430322408676147, + "num_tokens": 29976538.0, + "step": 788 + }, + { + "epoch": 0.10036890980791248, + "ewc_loss": 0.005279541015625, + "ewc_loss_parallel": 5.27501106262207e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.438887596130371, + "learning_rate": 3.340398473929631e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8184617757797241, + "num_tokens": 30019814.0, + "step": 789 + }, + { + "epoch": 0.100496120086503, + "ewc_loss": 0.00531005859375, + "ewc_loss_parallel": 5.304813385009766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.565927505493164, + "learning_rate": 3.3446375582874096e-07, + "loss": 0.5267, + "mean_token_accuracy": 0.8342880010604858, + "num_tokens": 30057578.0, + "step": 790 + }, + { + "epoch": 0.1006233303650935, + "ewc_loss": 0.005279541015625, + "ewc_loss_parallel": 5.27501106262207e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.445066452026367, + "learning_rate": 3.3488766426451886e-07, + "loss": 0.5996, + "mean_token_accuracy": 0.819659948348999, + "num_tokens": 30094869.0, + "step": 791 + }, + { + "epoch": 0.10075054064368401, + "ewc_loss": 0.0052490234375, + "ewc_loss_parallel": 5.245208740234375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.570521354675293, + "learning_rate": 3.353115727002967e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8250659704208374, + "num_tokens": 30136446.0, + "step": 792 + }, + { + "epoch": 0.10087775092227452, + "ewc_loss": 0.005340576171875, + "ewc_loss_parallel": 5.334615707397461e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.575368881225586, + "learning_rate": 3.357354811360746e-07, + "loss": 0.5538, + "mean_token_accuracy": 0.8213194012641907, + "num_tokens": 30179226.0, + "step": 793 + }, + { + "epoch": 0.10100496120086502, + "ewc_loss": 0.005279541015625, + "ewc_loss_parallel": 5.27501106262207e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.547992706298828, + "learning_rate": 3.3615938957185245e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8273480534553528, + "num_tokens": 30213890.0, + "step": 794 + }, + { + "epoch": 0.10113217147945554, + "ewc_loss": 0.005279541015625, + "ewc_loss_parallel": 5.27501106262207e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.673600196838379, + "learning_rate": 3.3658329800763035e-07, + "loss": 0.5949, + "mean_token_accuracy": 0.8098899126052856, + "num_tokens": 30245691.0, + "step": 795 + }, + { + "epoch": 0.10125938175804605, + "ewc_loss": 0.00537109375, + "ewc_loss_parallel": 5.364418029785156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.488553047180176, + "learning_rate": 3.370072064434082e-07, + "loss": 0.5502, + "mean_token_accuracy": 0.8193469047546387, + "num_tokens": 30281738.0, + "step": 796 + }, + { + "epoch": 0.10138659203663657, + "ewc_loss": 0.005279541015625, + "ewc_loss_parallel": 5.27501106262207e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.559022903442383, + "learning_rate": 3.374311148791861e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8343251943588257, + "num_tokens": 30320707.0, + "step": 797 + }, + { + "epoch": 0.10151380231522707, + "ewc_loss": 0.00537109375, + "ewc_loss_parallel": 5.364418029785156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.761475563049316, + "learning_rate": 3.3785502331496394e-07, + "loss": 0.524, + "mean_token_accuracy": 0.831268846988678, + "num_tokens": 30351379.0, + "step": 798 + }, + { + "epoch": 0.10164101259381758, + "ewc_loss": 0.00537109375, + "ewc_loss_parallel": 5.364418029785156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.643145561218262, + "learning_rate": 3.3827893175074184e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8246321678161621, + "num_tokens": 30382395.0, + "step": 799 + }, + { + "epoch": 0.1017682228724081, + "ewc_loss": 0.00543212890625, + "ewc_loss_parallel": 5.424022674560547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.757527351379395, + "learning_rate": 3.387028401865197e-07, + "loss": 0.5534, + "mean_token_accuracy": 0.8286967277526855, + "num_tokens": 30419758.0, + "step": 800 + }, + { + "epoch": 0.1018954331509986, + "ewc_loss": 0.00543212890625, + "ewc_loss_parallel": 5.424022674560547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.602568626403809, + "learning_rate": 3.391267486222976e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8267027139663696, + "num_tokens": 30458115.0, + "step": 801 + }, + { + "epoch": 0.10202264342958911, + "ewc_loss": 0.005340576171875, + "ewc_loss_parallel": 5.334615707397461e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.691551208496094, + "learning_rate": 3.3955065705807543e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.8170080780982971, + "num_tokens": 30489679.0, + "step": 802 + }, + { + "epoch": 0.10214985370817962, + "ewc_loss": 0.00543212890625, + "ewc_loss_parallel": 5.424022674560547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.624869346618652, + "learning_rate": 3.3997456549385333e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8394147157669067, + "num_tokens": 30524874.0, + "step": 803 + }, + { + "epoch": 0.10227706398677013, + "ewc_loss": 0.00543212890625, + "ewc_loss_parallel": 5.424022674560547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.587733268737793, + "learning_rate": 3.403984739296312e-07, + "loss": 0.5825, + "mean_token_accuracy": 0.8164583444595337, + "num_tokens": 30558354.0, + "step": 804 + }, + { + "epoch": 0.10240427426536064, + "ewc_loss": 0.005462646484375, + "ewc_loss_parallel": 5.453824996948242e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.610343933105469, + "learning_rate": 3.408223823654091e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8360012173652649, + "num_tokens": 30594827.0, + "step": 805 + }, + { + "epoch": 0.10253148454395115, + "ewc_loss": 0.005523681640625, + "ewc_loss_parallel": 5.513429641723633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.623819351196289, + "learning_rate": 3.412462908011869e-07, + "loss": 0.5519, + "mean_token_accuracy": 0.8315671682357788, + "num_tokens": 30635259.0, + "step": 806 + }, + { + "epoch": 0.10265869482254165, + "ewc_loss": 0.005523681640625, + "ewc_loss_parallel": 5.513429641723633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.53672981262207, + "learning_rate": 3.4167019923696477e-07, + "loss": 0.5454, + "mean_token_accuracy": 0.8328053951263428, + "num_tokens": 30681850.0, + "step": 807 + }, + { + "epoch": 0.10278590510113217, + "ewc_loss": 0.00543212890625, + "ewc_loss_parallel": 5.424022674560547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.670949935913086, + "learning_rate": 3.4209410767274267e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8408737182617188, + "num_tokens": 30721302.0, + "step": 808 + }, + { + "epoch": 0.10291311537972268, + "ewc_loss": 0.0054931640625, + "ewc_loss_parallel": 5.4836273193359375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.872076034545898, + "learning_rate": 3.4251801610852057e-07, + "loss": 0.5878, + "mean_token_accuracy": 0.8124858140945435, + "num_tokens": 30761975.0, + "step": 809 + }, + { + "epoch": 0.1030403256583132, + "ewc_loss": 0.0054931640625, + "ewc_loss_parallel": 5.4836273193359375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.643899917602539, + "learning_rate": 3.429419245442984e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8277047276496887, + "num_tokens": 30797754.0, + "step": 810 + }, + { + "epoch": 0.1031675359369037, + "ewc_loss": 0.005462646484375, + "ewc_loss_parallel": 5.453824996948242e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.569442749023438, + "learning_rate": 3.4336583298007626e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8383234739303589, + "num_tokens": 30834183.0, + "step": 811 + }, + { + "epoch": 0.10329474621549421, + "ewc_loss": 0.005523681640625, + "ewc_loss_parallel": 5.513429641723633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.549915313720703, + "learning_rate": 3.4378974141585416e-07, + "loss": 0.5396, + "mean_token_accuracy": 0.8307158350944519, + "num_tokens": 30874160.0, + "step": 812 + }, + { + "epoch": 0.10342195649408473, + "ewc_loss": 0.005523681640625, + "ewc_loss_parallel": 5.513429641723633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.570842742919922, + "learning_rate": 3.4421364985163206e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8460886478424072, + "num_tokens": 30916827.0, + "step": 813 + }, + { + "epoch": 0.10354916677267523, + "ewc_loss": 0.005462646484375, + "ewc_loss_parallel": 5.453824996948242e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.69027042388916, + "learning_rate": 3.446375582874099e-07, + "loss": 0.5506, + "mean_token_accuracy": 0.8283933401107788, + "num_tokens": 30960063.0, + "step": 814 + }, + { + "epoch": 0.10367637705126574, + "ewc_loss": 0.00555419921875, + "ewc_loss_parallel": 5.543231964111328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.823202133178711, + "learning_rate": 3.4506146672318775e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8387278318405151, + "num_tokens": 30995172.0, + "step": 815 + }, + { + "epoch": 0.10380358732985626, + "ewc_loss": 0.005523681640625, + "ewc_loss_parallel": 5.513429641723633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.693224906921387, + "learning_rate": 3.4548537515896565e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.8236805200576782, + "num_tokens": 31035089.0, + "step": 816 + }, + { + "epoch": 0.10393079760844676, + "ewc_loss": 0.005523681640625, + "ewc_loss_parallel": 5.513429641723633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.731481552124023, + "learning_rate": 3.4590928359474355e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8287594318389893, + "num_tokens": 31074836.0, + "step": 817 + }, + { + "epoch": 0.10405800788703727, + "ewc_loss": 0.005523681640625, + "ewc_loss_parallel": 5.513429641723633e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.617574691772461, + "learning_rate": 3.463331920305214e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.8233171701431274, + "num_tokens": 31114700.0, + "step": 818 + }, + { + "epoch": 0.10418521816562779, + "ewc_loss": 0.0054931640625, + "ewc_loss_parallel": 5.4836273193359375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.733567237854004, + "learning_rate": 3.4675710046629924e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8380081057548523, + "num_tokens": 31148258.0, + "step": 819 + }, + { + "epoch": 0.1043124284442183, + "ewc_loss": 0.005615234375, + "ewc_loss_parallel": 5.602836608886719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.85192584991455, + "learning_rate": 3.4718100890207714e-07, + "loss": 0.5472, + "mean_token_accuracy": 0.8271772861480713, + "num_tokens": 31185720.0, + "step": 820 + }, + { + "epoch": 0.1044396387228088, + "ewc_loss": 0.00555419921875, + "ewc_loss_parallel": 5.543231964111328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.739131927490234, + "learning_rate": 3.4760491733785504e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8204208016395569, + "num_tokens": 31220353.0, + "step": 821 + }, + { + "epoch": 0.10456684900139931, + "ewc_loss": 0.00555419921875, + "ewc_loss_parallel": 5.543231964111328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.70625114440918, + "learning_rate": 3.480288257736329e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.828035295009613, + "num_tokens": 31256477.0, + "step": 822 + }, + { + "epoch": 0.10469405927998983, + "ewc_loss": 0.005615234375, + "ewc_loss_parallel": 5.602836608886719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.775069236755371, + "learning_rate": 3.4845273420941073e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8319618701934814, + "num_tokens": 31294365.0, + "step": 823 + }, + { + "epoch": 0.10482126955858033, + "ewc_loss": 0.005615234375, + "ewc_loss_parallel": 5.602836608886719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.777291297912598, + "learning_rate": 3.4887664264518863e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8285511136054993, + "num_tokens": 31329357.0, + "step": 824 + }, + { + "epoch": 0.10494847983717084, + "ewc_loss": 0.005584716796875, + "ewc_loss_parallel": 5.5730342864990234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.624529838562012, + "learning_rate": 3.4930055108096653e-07, + "loss": 0.533, + "mean_token_accuracy": 0.8312632441520691, + "num_tokens": 31372180.0, + "step": 825 + }, + { + "epoch": 0.10507569011576136, + "ewc_loss": 0.005645751953125, + "ewc_loss_parallel": 5.632638931274414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.730598449707031, + "learning_rate": 3.497244595167443e-07, + "loss": 0.5174, + "mean_token_accuracy": 0.8359121084213257, + "num_tokens": 31414866.0, + "step": 826 + }, + { + "epoch": 0.10520290039435186, + "ewc_loss": 0.005645751953125, + "ewc_loss_parallel": 5.632638931274414e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.794461250305176, + "learning_rate": 3.501483679525222e-07, + "loss": 0.5867, + "mean_token_accuracy": 0.8159888982772827, + "num_tokens": 31450583.0, + "step": 827 + }, + { + "epoch": 0.10533011067294237, + "ewc_loss": 0.005615234375, + "ewc_loss_parallel": 5.602836608886719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.745637893676758, + "learning_rate": 3.505722763883001e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8336247205734253, + "num_tokens": 31490371.0, + "step": 828 + }, + { + "epoch": 0.10545732095153289, + "ewc_loss": 0.005615234375, + "ewc_loss_parallel": 5.602836608886719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.778873443603516, + "learning_rate": 3.50996184824078e-07, + "loss": 0.5603, + "mean_token_accuracy": 0.822288990020752, + "num_tokens": 31525951.0, + "step": 829 + }, + { + "epoch": 0.10558453123012339, + "ewc_loss": 0.005584716796875, + "ewc_loss_parallel": 5.5730342864990234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.735010147094727, + "learning_rate": 3.514200932598558e-07, + "loss": 0.5993, + "mean_token_accuracy": 0.812746524810791, + "num_tokens": 31570109.0, + "step": 830 + }, + { + "epoch": 0.1057117415087139, + "ewc_loss": 0.005615234375, + "ewc_loss_parallel": 5.602836608886719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.788352012634277, + "learning_rate": 3.518440016956337e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8296331763267517, + "num_tokens": 31610569.0, + "step": 831 + }, + { + "epoch": 0.10583895178730442, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.775522232055664, + "learning_rate": 3.522679101314116e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8289076089859009, + "num_tokens": 31653585.0, + "step": 832 + }, + { + "epoch": 0.10596616206589493, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.735991477966309, + "learning_rate": 3.526918185671895e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8208491802215576, + "num_tokens": 31696353.0, + "step": 833 + }, + { + "epoch": 0.10609337234448543, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.829605102539062, + "learning_rate": 3.531157270029673e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8425564765930176, + "num_tokens": 31730620.0, + "step": 834 + }, + { + "epoch": 0.10622058262307595, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.783180236816406, + "learning_rate": 3.535396354387452e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.836699903011322, + "num_tokens": 31770538.0, + "step": 835 + }, + { + "epoch": 0.10634779290166646, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.794903755187988, + "learning_rate": 3.539635438745231e-07, + "loss": 0.5516, + "mean_token_accuracy": 0.8244003057479858, + "num_tokens": 31808687.0, + "step": 836 + }, + { + "epoch": 0.10647500318025696, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.819005966186523, + "learning_rate": 3.54387452310301e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8328048586845398, + "num_tokens": 31852310.0, + "step": 837 + }, + { + "epoch": 0.10660221345884748, + "ewc_loss": 0.005706787109375, + "ewc_loss_parallel": 5.692243576049805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.768802642822266, + "learning_rate": 3.548113607460788e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8429931402206421, + "num_tokens": 31887897.0, + "step": 838 + }, + { + "epoch": 0.10672942373743799, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.902887344360352, + "learning_rate": 3.552352691818567e-07, + "loss": 0.5247, + "mean_token_accuracy": 0.8347318172454834, + "num_tokens": 31926159.0, + "step": 839 + }, + { + "epoch": 0.10685663401602849, + "ewc_loss": 0.005706787109375, + "ewc_loss_parallel": 5.692243576049805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.832185745239258, + "learning_rate": 3.556591776176346e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8420590758323669, + "num_tokens": 31968432.0, + "step": 840 + }, + { + "epoch": 0.106983844294619, + "ewc_loss": 0.005706787109375, + "ewc_loss_parallel": 5.692243576049805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.992095947265625, + "learning_rate": 3.560830860534125e-07, + "loss": 0.5332, + "mean_token_accuracy": 0.8322532176971436, + "num_tokens": 32000949.0, + "step": 841 + }, + { + "epoch": 0.10711105457320952, + "ewc_loss": 0.005706787109375, + "ewc_loss_parallel": 5.692243576049805e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.881021499633789, + "learning_rate": 3.565069944891903e-07, + "loss": 0.5765, + "mean_token_accuracy": 0.8181787133216858, + "num_tokens": 32033862.0, + "step": 842 + }, + { + "epoch": 0.10723826485180002, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.834044456481934, + "learning_rate": 3.569309029249682e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8308972120285034, + "num_tokens": 32068623.0, + "step": 843 + }, + { + "epoch": 0.10736547513039053, + "ewc_loss": 0.00567626953125, + "ewc_loss_parallel": 5.662441253662109e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.848053932189941, + "learning_rate": 3.573548113607461e-07, + "loss": 0.5237, + "mean_token_accuracy": 0.8340381979942322, + "num_tokens": 32106658.0, + "step": 844 + }, + { + "epoch": 0.10749268540898105, + "ewc_loss": 0.0057373046875, + "ewc_loss_parallel": 5.7220458984375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.87053394317627, + "learning_rate": 3.577787197965239e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8192099332809448, + "num_tokens": 32142031.0, + "step": 845 + }, + { + "epoch": 0.10761989568757156, + "ewc_loss": 0.0057373046875, + "ewc_loss_parallel": 5.751848220825195e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.934309959411621, + "learning_rate": 3.5820262823230177e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.845511794090271, + "num_tokens": 32182686.0, + "step": 846 + }, + { + "epoch": 0.10774710596616206, + "ewc_loss": 0.005767822265625, + "ewc_loss_parallel": 5.781650543212891e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.845756530761719, + "learning_rate": 3.5862653666807967e-07, + "loss": 0.5943, + "mean_token_accuracy": 0.8123201131820679, + "num_tokens": 32218027.0, + "step": 847 + }, + { + "epoch": 0.10787431624475258, + "ewc_loss": 0.0057373046875, + "ewc_loss_parallel": 5.7220458984375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.912137985229492, + "learning_rate": 3.5905044510385757e-07, + "loss": 0.5476, + "mean_token_accuracy": 0.829521119594574, + "num_tokens": 32257854.0, + "step": 848 + }, + { + "epoch": 0.10800152652334309, + "ewc_loss": 0.00579833984375, + "ewc_loss_parallel": 5.811452865600586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.88128662109375, + "learning_rate": 3.594743535396354e-07, + "loss": 0.5304, + "mean_token_accuracy": 0.8319528698921204, + "num_tokens": 32295019.0, + "step": 849 + }, + { + "epoch": 0.10812873680193359, + "ewc_loss": 0.0057373046875, + "ewc_loss_parallel": 5.751848220825195e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.971393585205078, + "learning_rate": 3.5989826197541326e-07, + "loss": 0.581, + "mean_token_accuracy": 0.8172149658203125, + "num_tokens": 32335652.0, + "step": 850 + }, + { + "epoch": 0.1082559470805241, + "ewc_loss": 0.005767822265625, + "ewc_loss_parallel": 5.781650543212891e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.765497207641602, + "learning_rate": 3.6032217041119116e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8290579915046692, + "num_tokens": 32377249.0, + "step": 851 + }, + { + "epoch": 0.10838315735911462, + "ewc_loss": 0.0057373046875, + "ewc_loss_parallel": 5.751848220825195e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.853706359863281, + "learning_rate": 3.6074607884696906e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8433611392974854, + "num_tokens": 32422467.0, + "step": 852 + }, + { + "epoch": 0.10851036763770512, + "ewc_loss": 0.005767822265625, + "ewc_loss_parallel": 5.781650543212891e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.85983943939209, + "learning_rate": 3.611699872827469e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.842096209526062, + "num_tokens": 32462120.0, + "step": 853 + }, + { + "epoch": 0.10863757791629564, + "ewc_loss": 0.005767822265625, + "ewc_loss_parallel": 5.781650543212891e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.955582618713379, + "learning_rate": 3.6159389571852475e-07, + "loss": 0.6116, + "mean_token_accuracy": 0.8094074726104736, + "num_tokens": 32498484.0, + "step": 854 + }, + { + "epoch": 0.10876478819488615, + "ewc_loss": 0.005828857421875, + "ewc_loss_parallel": 5.841255187988281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.94502067565918, + "learning_rate": 3.6201780415430265e-07, + "loss": 0.5452, + "mean_token_accuracy": 0.8281158208847046, + "num_tokens": 32541536.0, + "step": 855 + }, + { + "epoch": 0.10889199847347665, + "ewc_loss": 0.00579833984375, + "ewc_loss_parallel": 5.811452865600586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.023465156555176, + "learning_rate": 3.6244171259008055e-07, + "loss": 0.5699, + "mean_token_accuracy": 0.8240607976913452, + "num_tokens": 32579750.0, + "step": 856 + }, + { + "epoch": 0.10901920875206716, + "ewc_loss": 0.00579833984375, + "ewc_loss_parallel": 5.811452865600586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.875612258911133, + "learning_rate": 3.628656210258584e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.8175470232963562, + "num_tokens": 32617518.0, + "step": 857 + }, + { + "epoch": 0.10914641903065768, + "ewc_loss": 0.00579833984375, + "ewc_loss_parallel": 5.811452865600586e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.000316619873047, + "learning_rate": 3.6328952946163624e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8348584175109863, + "num_tokens": 32653931.0, + "step": 858 + }, + { + "epoch": 0.1092736293092482, + "ewc_loss": 0.005828857421875, + "ewc_loss_parallel": 5.841255187988281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.873762130737305, + "learning_rate": 3.6371343789741414e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8382956385612488, + "num_tokens": 32693797.0, + "step": 859 + }, + { + "epoch": 0.1094008395878387, + "ewc_loss": 0.005828857421875, + "ewc_loss_parallel": 5.841255187988281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.00682258605957, + "learning_rate": 3.6413734633319204e-07, + "loss": 0.5553, + "mean_token_accuracy": 0.8254320621490479, + "num_tokens": 32732432.0, + "step": 860 + }, + { + "epoch": 0.10952804986642921, + "ewc_loss": 0.005859375, + "ewc_loss_parallel": 5.8710575103759766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.005230903625488, + "learning_rate": 3.645612547689699e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8341967463493347, + "num_tokens": 32769225.0, + "step": 861 + }, + { + "epoch": 0.10965526014501972, + "ewc_loss": 0.005828857421875, + "ewc_loss_parallel": 5.841255187988281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.94100570678711, + "learning_rate": 3.6498516320474773e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8419594764709473, + "num_tokens": 32812027.0, + "step": 862 + }, + { + "epoch": 0.10978247042361022, + "ewc_loss": 0.005859375, + "ewc_loss_parallel": 5.8710575103759766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.96037769317627, + "learning_rate": 3.6540907164052563e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8474754095077515, + "num_tokens": 32849334.0, + "step": 863 + }, + { + "epoch": 0.10990968070220074, + "ewc_loss": 0.005859375, + "ewc_loss_parallel": 5.8710575103759766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.060441017150879, + "learning_rate": 3.658329800763035e-07, + "loss": 0.5745, + "mean_token_accuracy": 0.8175477385520935, + "num_tokens": 32888422.0, + "step": 864 + }, + { + "epoch": 0.11003689098079125, + "ewc_loss": 0.005859375, + "ewc_loss_parallel": 5.8710575103759766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.973366737365723, + "learning_rate": 3.662568885120814e-07, + "loss": 0.5339, + "mean_token_accuracy": 0.82949298620224, + "num_tokens": 32922672.0, + "step": 865 + }, + { + "epoch": 0.11016410125938175, + "ewc_loss": 0.005859375, + "ewc_loss_parallel": 5.8710575103759766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.864965438842773, + "learning_rate": 3.666807969478592e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8191651105880737, + "num_tokens": 32961694.0, + "step": 866 + }, + { + "epoch": 0.11029131153797227, + "ewc_loss": 0.005889892578125, + "ewc_loss_parallel": 5.900859832763672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.957420349121094, + "learning_rate": 3.671047053836371e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.8234838843345642, + "num_tokens": 32998595.0, + "step": 867 + }, + { + "epoch": 0.11041852181656278, + "ewc_loss": 0.00592041015625, + "ewc_loss_parallel": 5.930662155151367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.059054374694824, + "learning_rate": 3.6752861381941497e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8347465991973877, + "num_tokens": 33035884.0, + "step": 868 + }, + { + "epoch": 0.11054573209515328, + "ewc_loss": 0.005950927734375, + "ewc_loss_parallel": 5.9604644775390625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.986713409423828, + "learning_rate": 3.6795252225519287e-07, + "loss": 0.5746, + "mean_token_accuracy": 0.8213998079299927, + "num_tokens": 33073335.0, + "step": 869 + }, + { + "epoch": 0.1106729423737438, + "ewc_loss": 0.005889892578125, + "ewc_loss_parallel": 5.900859832763672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.920354843139648, + "learning_rate": 3.6837643069097077e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8477784395217896, + "num_tokens": 33111443.0, + "step": 870 + }, + { + "epoch": 0.11080015265233431, + "ewc_loss": 0.00592041015625, + "ewc_loss_parallel": 5.930662155151367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.945033073425293, + "learning_rate": 3.688003391267486e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.834466278553009, + "num_tokens": 33156362.0, + "step": 871 + }, + { + "epoch": 0.11092736293092482, + "ewc_loss": 0.00592041015625, + "ewc_loss_parallel": 5.930662155151367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.898727416992188, + "learning_rate": 3.6922424756252646e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.83955979347229, + "num_tokens": 33204968.0, + "step": 872 + }, + { + "epoch": 0.11105457320951533, + "ewc_loss": 0.005889892578125, + "ewc_loss_parallel": 5.900859832763672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.067780494689941, + "learning_rate": 3.6964815599830436e-07, + "loss": 0.5503, + "mean_token_accuracy": 0.8232656717300415, + "num_tokens": 33239265.0, + "step": 873 + }, + { + "epoch": 0.11118178348810584, + "ewc_loss": 0.005950927734375, + "ewc_loss_parallel": 5.9604644775390625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.068909645080566, + "learning_rate": 3.7007206443408226e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8475544452667236, + "num_tokens": 33275897.0, + "step": 874 + }, + { + "epoch": 0.11130899376669635, + "ewc_loss": 0.00592041015625, + "ewc_loss_parallel": 5.930662155151367e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.030485153198242, + "learning_rate": 3.704959728698601e-07, + "loss": 0.6073, + "mean_token_accuracy": 0.8098195791244507, + "num_tokens": 33313966.0, + "step": 875 + }, + { + "epoch": 0.11143620404528685, + "ewc_loss": 0.005950927734375, + "ewc_loss_parallel": 5.9604644775390625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.020992279052734, + "learning_rate": 3.7091988130563795e-07, + "loss": 0.5502, + "mean_token_accuracy": 0.8267896175384521, + "num_tokens": 33352909.0, + "step": 876 + }, + { + "epoch": 0.11156341432387737, + "ewc_loss": 0.005950927734375, + "ewc_loss_parallel": 5.9604644775390625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.913315773010254, + "learning_rate": 3.7134378974141585e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8323889970779419, + "num_tokens": 33400120.0, + "step": 877 + }, + { + "epoch": 0.11169062460246788, + "ewc_loss": 0.005950927734375, + "ewc_loss_parallel": 5.9604644775390625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.075492858886719, + "learning_rate": 3.7176769817719375e-07, + "loss": 0.5621, + "mean_token_accuracy": 0.8223938941955566, + "num_tokens": 33436703.0, + "step": 878 + }, + { + "epoch": 0.11181783488105838, + "ewc_loss": 0.006011962890625, + "ewc_loss_parallel": 6.020069122314453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.112468719482422, + "learning_rate": 3.7219160661297154e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.852033793926239, + "num_tokens": 33473515.0, + "step": 879 + }, + { + "epoch": 0.1119450451596489, + "ewc_loss": 0.0059814453125, + "ewc_loss_parallel": 5.990266799926758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.927395820617676, + "learning_rate": 3.7261551504874944e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.833818793296814, + "num_tokens": 33507659.0, + "step": 880 + }, + { + "epoch": 0.11207225543823941, + "ewc_loss": 0.005950927734375, + "ewc_loss_parallel": 5.9604644775390625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.18947982788086, + "learning_rate": 3.7303942348452734e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8449106216430664, + "num_tokens": 33542430.0, + "step": 881 + }, + { + "epoch": 0.11219946571682991, + "ewc_loss": 0.0059814453125, + "ewc_loss_parallel": 5.990266799926758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.007027626037598, + "learning_rate": 3.7346333192030524e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8485879898071289, + "num_tokens": 33582579.0, + "step": 882 + }, + { + "epoch": 0.11232667599542043, + "ewc_loss": 0.005889892578125, + "ewc_loss_parallel": 5.900859832763672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.891799926757812, + "learning_rate": 3.7388724035608303e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8423506021499634, + "num_tokens": 33625928.0, + "step": 883 + }, + { + "epoch": 0.11245388627401094, + "ewc_loss": 0.0059814453125, + "ewc_loss_parallel": 5.990266799926758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.121931076049805, + "learning_rate": 3.7431114879186093e-07, + "loss": 0.5507, + "mean_token_accuracy": 0.8273491859436035, + "num_tokens": 33661325.0, + "step": 884 + }, + { + "epoch": 0.11258109655260146, + "ewc_loss": 0.0059814453125, + "ewc_loss_parallel": 5.990266799926758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.101176261901855, + "learning_rate": 3.7473505722763883e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.8182263374328613, + "num_tokens": 33696472.0, + "step": 885 + }, + { + "epoch": 0.11270830683119196, + "ewc_loss": 0.005950927734375, + "ewc_loss_parallel": 5.9604644775390625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.036575317382812, + "learning_rate": 3.7515896566341673e-07, + "loss": 0.5282, + "mean_token_accuracy": 0.8300994038581848, + "num_tokens": 33732637.0, + "step": 886 + }, + { + "epoch": 0.11283551710978247, + "ewc_loss": 0.006011962890625, + "ewc_loss_parallel": 6.020069122314453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.184743881225586, + "learning_rate": 3.755828740991945e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8256086111068726, + "num_tokens": 33775106.0, + "step": 887 + }, + { + "epoch": 0.11296272738837299, + "ewc_loss": 0.006011962890625, + "ewc_loss_parallel": 6.020069122314453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.04217529296875, + "learning_rate": 3.760067825349724e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8387917280197144, + "num_tokens": 33811297.0, + "step": 888 + }, + { + "epoch": 0.11308993766696349, + "ewc_loss": 0.006011962890625, + "ewc_loss_parallel": 6.020069122314453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.074125289916992, + "learning_rate": 3.764306909707503e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.846867561340332, + "num_tokens": 33854038.0, + "step": 889 + }, + { + "epoch": 0.113217147945554, + "ewc_loss": 0.00604248046875, + "ewc_loss_parallel": 6.0498714447021484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.081238746643066, + "learning_rate": 3.768545994065282e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.8306074738502502, + "num_tokens": 33884929.0, + "step": 890 + }, + { + "epoch": 0.11334435822414451, + "ewc_loss": 0.0059814453125, + "ewc_loss_parallel": 5.990266799926758e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 10.970782279968262, + "learning_rate": 3.77278507842306e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.8315808773040771, + "num_tokens": 33926065.0, + "step": 891 + }, + { + "epoch": 0.11347156850273502, + "ewc_loss": 0.00604248046875, + "ewc_loss_parallel": 6.0498714447021484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.069509506225586, + "learning_rate": 3.777024162780839e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8415079116821289, + "num_tokens": 33965557.0, + "step": 892 + }, + { + "epoch": 0.11359877878132553, + "ewc_loss": 0.006134033203125, + "ewc_loss_parallel": 6.139278411865234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.18265438079834, + "learning_rate": 3.781263247138618e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8407939672470093, + "num_tokens": 34004132.0, + "step": 893 + }, + { + "epoch": 0.11372598905991604, + "ewc_loss": 0.006103515625, + "ewc_loss_parallel": 6.109476089477539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.157940864562988, + "learning_rate": 3.785502331496397e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8249938488006592, + "num_tokens": 34039781.0, + "step": 894 + }, + { + "epoch": 0.11385319933850654, + "ewc_loss": 0.006072998046875, + "ewc_loss_parallel": 6.079673767089844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.228582382202148, + "learning_rate": 3.789741415854175e-07, + "loss": 0.5548, + "mean_token_accuracy": 0.8230118751525879, + "num_tokens": 34080487.0, + "step": 895 + }, + { + "epoch": 0.11398040961709706, + "ewc_loss": 0.006103515625, + "ewc_loss_parallel": 6.109476089477539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.086329460144043, + "learning_rate": 3.793980500211954e-07, + "loss": 0.53, + "mean_token_accuracy": 0.8320916295051575, + "num_tokens": 34119913.0, + "step": 896 + }, + { + "epoch": 0.11410761989568757, + "ewc_loss": 0.006072998046875, + "ewc_loss_parallel": 6.079673767089844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.131827354431152, + "learning_rate": 3.798219584569733e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8504379391670227, + "num_tokens": 34155811.0, + "step": 897 + }, + { + "epoch": 0.11423483017427809, + "ewc_loss": 0.006072998046875, + "ewc_loss_parallel": 6.079673767089844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.069655418395996, + "learning_rate": 3.8024586689275115e-07, + "loss": 0.5868, + "mean_token_accuracy": 0.8186862468719482, + "num_tokens": 34195022.0, + "step": 898 + }, + { + "epoch": 0.11436204045286859, + "ewc_loss": 0.006103515625, + "ewc_loss_parallel": 6.109476089477539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.220834732055664, + "learning_rate": 3.80669775328529e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8311923146247864, + "num_tokens": 34229182.0, + "step": 899 + }, + { + "epoch": 0.1144892507314591, + "ewc_loss": 0.006103515625, + "ewc_loss_parallel": 6.109476089477539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.309686660766602, + "learning_rate": 3.810936837643069e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8450090885162354, + "num_tokens": 34266931.0, + "step": 900 + }, + { + "epoch": 0.11461646101004962, + "ewc_loss": 0.00616455078125, + "ewc_loss_parallel": 6.16908073425293e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.218480110168457, + "learning_rate": 3.815175922000848e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8452094793319702, + "num_tokens": 34301705.0, + "step": 901 + }, + { + "epoch": 0.11474367128864012, + "ewc_loss": 0.006103515625, + "ewc_loss_parallel": 6.109476089477539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.090598106384277, + "learning_rate": 3.8194150063586264e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.8285641074180603, + "num_tokens": 34340975.0, + "step": 902 + }, + { + "epoch": 0.11487088156723063, + "ewc_loss": 0.006195068359375, + "ewc_loss_parallel": 6.198883056640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.095065116882324, + "learning_rate": 3.823654090716405e-07, + "loss": 0.522, + "mean_token_accuracy": 0.8320522904396057, + "num_tokens": 34380024.0, + "step": 903 + }, + { + "epoch": 0.11499809184582115, + "ewc_loss": 0.006103515625, + "ewc_loss_parallel": 6.109476089477539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.068158149719238, + "learning_rate": 3.827893175074184e-07, + "loss": 0.5432, + "mean_token_accuracy": 0.8347440958023071, + "num_tokens": 34419654.0, + "step": 904 + }, + { + "epoch": 0.11512530212441165, + "ewc_loss": 0.006195068359375, + "ewc_loss_parallel": 6.198883056640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.246685981750488, + "learning_rate": 3.832132259431963e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8496615886688232, + "num_tokens": 34453034.0, + "step": 905 + }, + { + "epoch": 0.11525251240300216, + "ewc_loss": 0.0062255859375, + "ewc_loss_parallel": 6.22868537902832e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.319170951843262, + "learning_rate": 3.8363713437897413e-07, + "loss": 0.5815, + "mean_token_accuracy": 0.8163115978240967, + "num_tokens": 34493694.0, + "step": 906 + }, + { + "epoch": 0.11537972268159268, + "ewc_loss": 0.006195068359375, + "ewc_loss_parallel": 6.198883056640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.107304573059082, + "learning_rate": 3.8406104281475197e-07, + "loss": 0.5735, + "mean_token_accuracy": 0.8206253051757812, + "num_tokens": 34531636.0, + "step": 907 + }, + { + "epoch": 0.11550693296018319, + "ewc_loss": 0.006134033203125, + "ewc_loss_parallel": 6.139278411865234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.092842102050781, + "learning_rate": 3.8448495125052987e-07, + "loss": 0.5569, + "mean_token_accuracy": 0.8211956024169922, + "num_tokens": 34570851.0, + "step": 908 + }, + { + "epoch": 0.11563414323877369, + "ewc_loss": 0.0062255859375, + "ewc_loss_parallel": 6.22868537902832e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.140606880187988, + "learning_rate": 3.8490885968630777e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8398898839950562, + "num_tokens": 34613314.0, + "step": 909 + }, + { + "epoch": 0.1157613535173642, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.119017601013184, + "learning_rate": 3.853327681220856e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.832334578037262, + "num_tokens": 34656701.0, + "step": 910 + }, + { + "epoch": 0.11588856379595472, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.309151649475098, + "learning_rate": 3.8575667655786346e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8416734933853149, + "num_tokens": 34688595.0, + "step": 911 + }, + { + "epoch": 0.11601577407454522, + "ewc_loss": 0.00628662109375, + "ewc_loss_parallel": 6.288290023803711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.258482933044434, + "learning_rate": 3.8618058499364136e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8366302847862244, + "num_tokens": 34726609.0, + "step": 912 + }, + { + "epoch": 0.11614298435313573, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.19645881652832, + "learning_rate": 3.8660449342941926e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8216055631637573, + "num_tokens": 34769213.0, + "step": 913 + }, + { + "epoch": 0.11627019463172625, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.2431058883667, + "learning_rate": 3.870284018651971e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8274402022361755, + "num_tokens": 34806722.0, + "step": 914 + }, + { + "epoch": 0.11639740491031675, + "ewc_loss": 0.0062255859375, + "ewc_loss_parallel": 6.22868537902832e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.172798156738281, + "learning_rate": 3.8745231030097495e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8426200151443481, + "num_tokens": 34846408.0, + "step": 915 + }, + { + "epoch": 0.11652461518890726, + "ewc_loss": 0.0062255859375, + "ewc_loss_parallel": 6.22868537902832e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.2007417678833, + "learning_rate": 3.8787621873675285e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8436423540115356, + "num_tokens": 34883791.0, + "step": 916 + }, + { + "epoch": 0.11665182546749778, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.2182035446167, + "learning_rate": 3.883001271725307e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8458905220031738, + "num_tokens": 34922768.0, + "step": 917 + }, + { + "epoch": 0.11677903574608828, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.161470413208008, + "learning_rate": 3.887240356083086e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8499764800071716, + "num_tokens": 34962184.0, + "step": 918 + }, + { + "epoch": 0.11690624602467879, + "ewc_loss": 0.00628662109375, + "ewc_loss_parallel": 6.288290023803711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.386494636535645, + "learning_rate": 3.8914794404408644e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.8362277746200562, + "num_tokens": 34991455.0, + "step": 919 + }, + { + "epoch": 0.1170334563032693, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.359533309936523, + "learning_rate": 3.8957185247986434e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.8270268440246582, + "num_tokens": 35025044.0, + "step": 920 + }, + { + "epoch": 0.11716066658185982, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.22579288482666, + "learning_rate": 3.899957609156422e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.835875391960144, + "num_tokens": 35064564.0, + "step": 921 + }, + { + "epoch": 0.11728787686045032, + "ewc_loss": 0.00628662109375, + "ewc_loss_parallel": 6.288290023803711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.246161460876465, + "learning_rate": 3.904196693514201e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.8326363563537598, + "num_tokens": 35109786.0, + "step": 922 + }, + { + "epoch": 0.11741508713904084, + "ewc_loss": 0.006317138671875, + "ewc_loss_parallel": 6.318092346191406e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.383330345153809, + "learning_rate": 3.9084357778719793e-07, + "loss": 0.5936, + "mean_token_accuracy": 0.8120492100715637, + "num_tokens": 35148351.0, + "step": 923 + }, + { + "epoch": 0.11754229741763135, + "ewc_loss": 0.00628662109375, + "ewc_loss_parallel": 6.288290023803711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.408360481262207, + "learning_rate": 3.9126748622297583e-07, + "loss": 0.5578, + "mean_token_accuracy": 0.8251485228538513, + "num_tokens": 35183464.0, + "step": 924 + }, + { + "epoch": 0.11766950769622185, + "ewc_loss": 0.006256103515625, + "ewc_loss_parallel": 6.258487701416016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.127041816711426, + "learning_rate": 3.916913946587537e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.8275742530822754, + "num_tokens": 35218561.0, + "step": 925 + }, + { + "epoch": 0.11779671797481236, + "ewc_loss": 0.00628662109375, + "ewc_loss_parallel": 6.288290023803711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.313324928283691, + "learning_rate": 3.921153030945316e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8237093687057495, + "num_tokens": 35257740.0, + "step": 926 + }, + { + "epoch": 0.11792392825340288, + "ewc_loss": 0.00634765625, + "ewc_loss_parallel": 6.3478946685791016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.335200309753418, + "learning_rate": 3.925392115303094e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8372139930725098, + "num_tokens": 35291632.0, + "step": 927 + }, + { + "epoch": 0.11805113853199338, + "ewc_loss": 0.006317138671875, + "ewc_loss_parallel": 6.318092346191406e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.441987037658691, + "learning_rate": 3.929631199660873e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8338524103164673, + "num_tokens": 35327686.0, + "step": 928 + }, + { + "epoch": 0.1181783488105839, + "ewc_loss": 0.00640869140625, + "ewc_loss_parallel": 6.407499313354492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.397930145263672, + "learning_rate": 3.9338702840186517e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8335764408111572, + "num_tokens": 35364480.0, + "step": 929 + }, + { + "epoch": 0.11830555908917441, + "ewc_loss": 0.00634765625, + "ewc_loss_parallel": 6.3478946685791016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.195032119750977, + "learning_rate": 3.9381093683764307e-07, + "loss": 0.522, + "mean_token_accuracy": 0.8351596593856812, + "num_tokens": 35406017.0, + "step": 930 + }, + { + "epoch": 0.11843276936776491, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.258423805236816, + "learning_rate": 3.942348452734209e-07, + "loss": 0.5811, + "mean_token_accuracy": 0.8165842294692993, + "num_tokens": 35448447.0, + "step": 931 + }, + { + "epoch": 0.11855997964635542, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.441691398620605, + "learning_rate": 3.946587537091988e-07, + "loss": 0.5295, + "mean_token_accuracy": 0.8299527168273926, + "num_tokens": 35488120.0, + "step": 932 + }, + { + "epoch": 0.11868718992494594, + "ewc_loss": 0.00640869140625, + "ewc_loss_parallel": 6.407499313354492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.307446479797363, + "learning_rate": 3.9508266214497666e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8388119339942932, + "num_tokens": 35531759.0, + "step": 933 + }, + { + "epoch": 0.11881440020353645, + "ewc_loss": 0.00634765625, + "ewc_loss_parallel": 6.3478946685791016e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.20001220703125, + "learning_rate": 3.9550657058075456e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8456229567527771, + "num_tokens": 35571476.0, + "step": 934 + }, + { + "epoch": 0.11894161048212695, + "ewc_loss": 0.00640869140625, + "ewc_loss_parallel": 6.407499313354492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.379361152648926, + "learning_rate": 3.959304790165324e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.8336330056190491, + "num_tokens": 35608440.0, + "step": 935 + }, + { + "epoch": 0.11906882076071747, + "ewc_loss": 0.00640869140625, + "ewc_loss_parallel": 6.407499313354492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.358095169067383, + "learning_rate": 3.9635438745231025e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8346856832504272, + "num_tokens": 35641861.0, + "step": 936 + }, + { + "epoch": 0.11919603103930798, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.3583402633667, + "learning_rate": 3.9677829588808815e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8262609243392944, + "num_tokens": 35684626.0, + "step": 937 + }, + { + "epoch": 0.11932324131789848, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.450848579406738, + "learning_rate": 3.9720220432386605e-07, + "loss": 0.5512, + "mean_token_accuracy": 0.8231324553489685, + "num_tokens": 35718125.0, + "step": 938 + }, + { + "epoch": 0.119450451596489, + "ewc_loss": 0.006439208984375, + "ewc_loss_parallel": 6.4373016357421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.320228576660156, + "learning_rate": 3.976261127596439e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8439708948135376, + "num_tokens": 35752818.0, + "step": 939 + }, + { + "epoch": 0.11957766187507951, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.392444610595703, + "learning_rate": 3.9805002119542174e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8482145071029663, + "num_tokens": 35787097.0, + "step": 940 + }, + { + "epoch": 0.11970487215367001, + "ewc_loss": 0.006439208984375, + "ewc_loss_parallel": 6.4373016357421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.3844575881958, + "learning_rate": 3.9847392963119964e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8210455179214478, + "num_tokens": 35825887.0, + "step": 941 + }, + { + "epoch": 0.11983208243226053, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.343664169311523, + "learning_rate": 3.9889783806697754e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8432450294494629, + "num_tokens": 35863592.0, + "step": 942 + }, + { + "epoch": 0.11995929271085104, + "ewc_loss": 0.00640869140625, + "ewc_loss_parallel": 6.407499313354492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.372085571289062, + "learning_rate": 3.993217465027554e-07, + "loss": 0.5768, + "mean_token_accuracy": 0.8215351104736328, + "num_tokens": 35904483.0, + "step": 943 + }, + { + "epoch": 0.12008650298944154, + "ewc_loss": 0.00640869140625, + "ewc_loss_parallel": 6.407499313354492e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.436942100524902, + "learning_rate": 3.9974565493853323e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8526290655136108, + "num_tokens": 35938662.0, + "step": 944 + }, + { + "epoch": 0.12021371326803205, + "ewc_loss": 0.006439208984375, + "ewc_loss_parallel": 6.4373016357421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.361159324645996, + "learning_rate": 4.0016956337431113e-07, + "loss": 0.5277, + "mean_token_accuracy": 0.8316980600357056, + "num_tokens": 35975176.0, + "step": 945 + }, + { + "epoch": 0.12034092354662257, + "ewc_loss": 0.006378173828125, + "ewc_loss_parallel": 6.377696990966797e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.352616310119629, + "learning_rate": 4.0059347181008903e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8360581994056702, + "num_tokens": 36008893.0, + "step": 946 + }, + { + "epoch": 0.12046813382521308, + "ewc_loss": 0.006439208984375, + "ewc_loss_parallel": 6.4373016357421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.357592582702637, + "learning_rate": 4.010173802458669e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8513116836547852, + "num_tokens": 36049882.0, + "step": 947 + }, + { + "epoch": 0.12059534410380358, + "ewc_loss": 0.006439208984375, + "ewc_loss_parallel": 6.4373016357421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.42014217376709, + "learning_rate": 4.014412886816447e-07, + "loss": 0.5873, + "mean_token_accuracy": 0.8186429738998413, + "num_tokens": 36090615.0, + "step": 948 + }, + { + "epoch": 0.1207225543823941, + "ewc_loss": 0.0064697265625, + "ewc_loss_parallel": 6.467103958129883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.379570960998535, + "learning_rate": 4.018651971174226e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8326207399368286, + "num_tokens": 36128159.0, + "step": 949 + }, + { + "epoch": 0.12084976466098461, + "ewc_loss": 0.006439208984375, + "ewc_loss_parallel": 6.4373016357421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.378317832946777, + "learning_rate": 4.022891055532005e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8318923711776733, + "num_tokens": 36172851.0, + "step": 950 + }, + { + "epoch": 0.12097697493957511, + "ewc_loss": 0.006500244140625, + "ewc_loss_parallel": 6.496906280517578e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.440315246582031, + "learning_rate": 4.0271301398897837e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8202083110809326, + "num_tokens": 36214282.0, + "step": 951 + }, + { + "epoch": 0.12110418521816563, + "ewc_loss": 0.006439208984375, + "ewc_loss_parallel": 6.4373016357421875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.445229530334473, + "learning_rate": 4.031369224247562e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8358110785484314, + "num_tokens": 36248619.0, + "step": 952 + }, + { + "epoch": 0.12123139549675614, + "ewc_loss": 0.0064697265625, + "ewc_loss_parallel": 6.467103958129883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.421783447265625, + "learning_rate": 4.035608308605341e-07, + "loss": 0.5392, + "mean_token_accuracy": 0.8281242251396179, + "num_tokens": 36289070.0, + "step": 953 + }, + { + "epoch": 0.12135860577534664, + "ewc_loss": 0.0064697265625, + "ewc_loss_parallel": 6.467103958129883e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.393950462341309, + "learning_rate": 4.03984739296312e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8319294452667236, + "num_tokens": 36330883.0, + "step": 954 + }, + { + "epoch": 0.12148581605393716, + "ewc_loss": 0.006500244140625, + "ewc_loss_parallel": 6.496906280517578e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.453580856323242, + "learning_rate": 4.044086477320898e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8355134129524231, + "num_tokens": 36371507.0, + "step": 955 + }, + { + "epoch": 0.12161302633252767, + "ewc_loss": 0.00653076171875, + "ewc_loss_parallel": 6.5267086029052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.53702163696289, + "learning_rate": 4.048325561678677e-07, + "loss": 0.5411, + "mean_token_accuracy": 0.8279442191123962, + "num_tokens": 36407262.0, + "step": 956 + }, + { + "epoch": 0.12174023661111817, + "ewc_loss": 0.00653076171875, + "ewc_loss_parallel": 6.5267086029052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.53119945526123, + "learning_rate": 4.052564646036456e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8311896324157715, + "num_tokens": 36448089.0, + "step": 957 + }, + { + "epoch": 0.12186744688970869, + "ewc_loss": 0.006500244140625, + "ewc_loss_parallel": 6.496906280517578e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.445676803588867, + "learning_rate": 4.056803730394235e-07, + "loss": 0.55, + "mean_token_accuracy": 0.8231745958328247, + "num_tokens": 36484022.0, + "step": 958 + }, + { + "epoch": 0.1219946571682992, + "ewc_loss": 0.006500244140625, + "ewc_loss_parallel": 6.496906280517578e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.390281677246094, + "learning_rate": 4.061042814752013e-07, + "loss": 0.5255, + "mean_token_accuracy": 0.8340187072753906, + "num_tokens": 36519385.0, + "step": 959 + }, + { + "epoch": 0.12212186744688971, + "ewc_loss": 0.00653076171875, + "ewc_loss_parallel": 6.5267086029052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.647344589233398, + "learning_rate": 4.065281899109792e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8306756019592285, + "num_tokens": 36551943.0, + "step": 960 + }, + { + "epoch": 0.12224907772548022, + "ewc_loss": 0.006622314453125, + "ewc_loss_parallel": 6.616115570068359e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.565139770507812, + "learning_rate": 4.069520983467571e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8482080698013306, + "num_tokens": 36590927.0, + "step": 961 + }, + { + "epoch": 0.12237628800407073, + "ewc_loss": 0.00653076171875, + "ewc_loss_parallel": 6.5267086029052734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.485408782958984, + "learning_rate": 4.07376006782535e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8358437418937683, + "num_tokens": 36625199.0, + "step": 962 + }, + { + "epoch": 0.12250349828266124, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.525609016418457, + "learning_rate": 4.077999152183128e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8490269184112549, + "num_tokens": 36658981.0, + "step": 963 + }, + { + "epoch": 0.12263070856125174, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.466449737548828, + "learning_rate": 4.082238236540907e-07, + "loss": 0.5304, + "mean_token_accuracy": 0.8349442481994629, + "num_tokens": 36704214.0, + "step": 964 + }, + { + "epoch": 0.12275791883984226, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.533071517944336, + "learning_rate": 4.086477320898686e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8394196033477783, + "num_tokens": 36744782.0, + "step": 965 + }, + { + "epoch": 0.12288512911843277, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.620100021362305, + "learning_rate": 4.090716405256465e-07, + "loss": 0.5354, + "mean_token_accuracy": 0.8280181884765625, + "num_tokens": 36783645.0, + "step": 966 + }, + { + "epoch": 0.12301233939702327, + "ewc_loss": 0.006622314453125, + "ewc_loss_parallel": 6.616115570068359e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.609374046325684, + "learning_rate": 4.094955489614243e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8388323783874512, + "num_tokens": 36817539.0, + "step": 967 + }, + { + "epoch": 0.12313954967561379, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.581934928894043, + "learning_rate": 4.099194573972022e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8380284905433655, + "num_tokens": 36859144.0, + "step": 968 + }, + { + "epoch": 0.1232667599542043, + "ewc_loss": 0.006622314453125, + "ewc_loss_parallel": 6.616115570068359e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.480524063110352, + "learning_rate": 4.1034336583298007e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8263152837753296, + "num_tokens": 36900833.0, + "step": 969 + }, + { + "epoch": 0.1233939702327948, + "ewc_loss": 0.006561279296875, + "ewc_loss_parallel": 6.556510925292969e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.544888496398926, + "learning_rate": 4.1076727426875797e-07, + "loss": 0.5369, + "mean_token_accuracy": 0.8291109204292297, + "num_tokens": 36940935.0, + "step": 970 + }, + { + "epoch": 0.12352118051138532, + "ewc_loss": 0.006622314453125, + "ewc_loss_parallel": 6.616115570068359e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.606925964355469, + "learning_rate": 4.1119118270453577e-07, + "loss": 0.5994, + "mean_token_accuracy": 0.8097186088562012, + "num_tokens": 36979874.0, + "step": 971 + }, + { + "epoch": 0.12364839078997583, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.52630615234375, + "learning_rate": 4.1161509114031366e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8480772972106934, + "num_tokens": 37017917.0, + "step": 972 + }, + { + "epoch": 0.12377560106856635, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.509787559509277, + "learning_rate": 4.1203899957609156e-07, + "loss": 0.5446, + "mean_token_accuracy": 0.8270389437675476, + "num_tokens": 37054914.0, + "step": 973 + }, + { + "epoch": 0.12390281134715685, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.56696605682373, + "learning_rate": 4.124629080118694e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.8392511606216431, + "num_tokens": 37088277.0, + "step": 974 + }, + { + "epoch": 0.12403002162574736, + "ewc_loss": 0.006591796875, + "ewc_loss_parallel": 6.586313247680664e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.574953079223633, + "learning_rate": 4.1288681644764726e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.8264532685279846, + "num_tokens": 37126963.0, + "step": 975 + }, + { + "epoch": 0.12415723190433788, + "ewc_loss": 0.006622314453125, + "ewc_loss_parallel": 6.616115570068359e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.706050872802734, + "learning_rate": 4.1331072488342515e-07, + "loss": 0.555, + "mean_token_accuracy": 0.8251731395721436, + "num_tokens": 37161948.0, + "step": 976 + }, + { + "epoch": 0.12428444218292838, + "ewc_loss": 0.006683349609375, + "ewc_loss_parallel": 6.67572021484375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.558656692504883, + "learning_rate": 4.1373463331920305e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8507957458496094, + "num_tokens": 37198552.0, + "step": 977 + }, + { + "epoch": 0.12441165246151889, + "ewc_loss": 0.00665283203125, + "ewc_loss_parallel": 6.645917892456055e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.611902236938477, + "learning_rate": 4.141585417549809e-07, + "loss": 0.5397, + "mean_token_accuracy": 0.8257325291633606, + "num_tokens": 37235990.0, + "step": 978 + }, + { + "epoch": 0.1245388627401094, + "ewc_loss": 0.006683349609375, + "ewc_loss_parallel": 6.67572021484375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.64260196685791, + "learning_rate": 4.1458245019075875e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8390541672706604, + "num_tokens": 37269664.0, + "step": 979 + }, + { + "epoch": 0.1246660730186999, + "ewc_loss": 0.0067138671875, + "ewc_loss_parallel": 6.705522537231445e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.60258674621582, + "learning_rate": 4.1500635862653664e-07, + "loss": 0.5295, + "mean_token_accuracy": 0.8328495025634766, + "num_tokens": 37308647.0, + "step": 980 + }, + { + "epoch": 0.12479328329729042, + "ewc_loss": 0.0067138671875, + "ewc_loss_parallel": 6.705522537231445e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.539310455322266, + "learning_rate": 4.1543026706231454e-07, + "loss": 0.479, + "mean_token_accuracy": 0.847491443157196, + "num_tokens": 37345953.0, + "step": 981 + }, + { + "epoch": 0.12492049357588093, + "ewc_loss": 0.006622314453125, + "ewc_loss_parallel": 6.616115570068359e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.623638153076172, + "learning_rate": 4.158541754980924e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8502812385559082, + "num_tokens": 37382735.0, + "step": 982 + }, + { + "epoch": 0.12504770385447145, + "ewc_loss": 0.0067138671875, + "ewc_loss_parallel": 6.705522537231445e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.732346534729004, + "learning_rate": 4.1627808393387024e-07, + "loss": 0.5417, + "mean_token_accuracy": 0.8258789777755737, + "num_tokens": 37419605.0, + "step": 983 + }, + { + "epoch": 0.12517491413306195, + "ewc_loss": 0.006744384765625, + "ewc_loss_parallel": 6.735324859619141e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.587833404541016, + "learning_rate": 4.1670199236964813e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.8247244358062744, + "num_tokens": 37455395.0, + "step": 984 + }, + { + "epoch": 0.12530212441165245, + "ewc_loss": 0.0067138671875, + "ewc_loss_parallel": 6.705522537231445e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.653383255004883, + "learning_rate": 4.1712590080542603e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8448973894119263, + "num_tokens": 37492208.0, + "step": 985 + }, + { + "epoch": 0.12542933469024298, + "ewc_loss": 0.00677490234375, + "ewc_loss_parallel": 6.765127182006836e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.57749080657959, + "learning_rate": 4.175498092412039e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8339201211929321, + "num_tokens": 37533675.0, + "step": 986 + }, + { + "epoch": 0.12555654496883348, + "ewc_loss": 0.006744384765625, + "ewc_loss_parallel": 6.735324859619141e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.657464981079102, + "learning_rate": 4.179737176769817e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8422200679779053, + "num_tokens": 37572792.0, + "step": 987 + }, + { + "epoch": 0.12568375524742398, + "ewc_loss": 0.006805419921875, + "ewc_loss_parallel": 6.794929504394531e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.727885246276855, + "learning_rate": 4.183976261127596e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.841144323348999, + "num_tokens": 37611481.0, + "step": 988 + }, + { + "epoch": 0.1258109655260145, + "ewc_loss": 0.006744384765625, + "ewc_loss_parallel": 6.735324859619141e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.624896049499512, + "learning_rate": 4.1882153454853747e-07, + "loss": 0.5723, + "mean_token_accuracy": 0.8183115720748901, + "num_tokens": 37648253.0, + "step": 989 + }, + { + "epoch": 0.125938175804605, + "ewc_loss": 0.00677490234375, + "ewc_loss_parallel": 6.765127182006836e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.641664505004883, + "learning_rate": 4.1924544298431537e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8405468463897705, + "num_tokens": 37687086.0, + "step": 990 + }, + { + "epoch": 0.12606538608319554, + "ewc_loss": 0.0068359375, + "ewc_loss_parallel": 6.8247318267822266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.644551277160645, + "learning_rate": 4.196693514200932e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8412900567054749, + "num_tokens": 37730928.0, + "step": 991 + }, + { + "epoch": 0.12619259636178604, + "ewc_loss": 0.006866455078125, + "ewc_loss_parallel": 6.854534149169922e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.63580322265625, + "learning_rate": 4.200932598558711e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8305172920227051, + "num_tokens": 37771857.0, + "step": 992 + }, + { + "epoch": 0.12631980664037654, + "ewc_loss": 0.0068359375, + "ewc_loss_parallel": 6.8247318267822266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.68128490447998, + "learning_rate": 4.2051716829164896e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8350937366485596, + "num_tokens": 37817517.0, + "step": 993 + }, + { + "epoch": 0.12644701691896706, + "ewc_loss": 0.0068359375, + "ewc_loss_parallel": 6.8247318267822266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.760419845581055, + "learning_rate": 4.2094107672742686e-07, + "loss": 0.5702, + "mean_token_accuracy": 0.8232302665710449, + "num_tokens": 37855891.0, + "step": 994 + }, + { + "epoch": 0.12657422719755757, + "ewc_loss": 0.0068359375, + "ewc_loss_parallel": 6.8247318267822266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.555272102355957, + "learning_rate": 4.2136498516320476e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8441894054412842, + "num_tokens": 37888719.0, + "step": 995 + }, + { + "epoch": 0.12670143747614807, + "ewc_loss": 0.006866455078125, + "ewc_loss_parallel": 6.854534149169922e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.67696475982666, + "learning_rate": 4.217888935989826e-07, + "loss": 0.5416, + "mean_token_accuracy": 0.8286706209182739, + "num_tokens": 37932968.0, + "step": 996 + }, + { + "epoch": 0.1268286477547386, + "ewc_loss": 0.006927490234375, + "ewc_loss_parallel": 6.9141387939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.763500213623047, + "learning_rate": 4.2221280203476045e-07, + "loss": 0.533, + "mean_token_accuracy": 0.8290736675262451, + "num_tokens": 37974297.0, + "step": 997 + }, + { + "epoch": 0.1269558580333291, + "ewc_loss": 0.006927490234375, + "ewc_loss_parallel": 6.9141387939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.700416564941406, + "learning_rate": 4.2263671047053835e-07, + "loss": 0.5507, + "mean_token_accuracy": 0.8254102468490601, + "num_tokens": 38020294.0, + "step": 998 + }, + { + "epoch": 0.1270830683119196, + "ewc_loss": 0.006866455078125, + "ewc_loss_parallel": 6.854534149169922e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.597610473632812, + "learning_rate": 4.2306061890631625e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.832314133644104, + "num_tokens": 38063322.0, + "step": 999 + }, + { + "epoch": 0.12721027859051012, + "ewc_loss": 0.006866455078125, + "ewc_loss_parallel": 6.854534149169922e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.732376098632812, + "learning_rate": 4.234845273420941e-07, + "loss": 0.6193, + "mean_token_accuracy": 0.8123222589492798, + "num_tokens": 38101052.0, + "step": 1000 + }, + { + "epoch": 0.12733748886910062, + "ewc_loss": 0.006988525390625, + "ewc_loss_parallel": 6.973743438720703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.720954895019531, + "learning_rate": 4.2390843577787194e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.8287047743797302, + "num_tokens": 38145472.0, + "step": 1001 + }, + { + "epoch": 0.12746469914769112, + "ewc_loss": 0.00689697265625, + "ewc_loss_parallel": 6.884336471557617e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.74082088470459, + "learning_rate": 4.2433234421364984e-07, + "loss": 0.5141, + "mean_token_accuracy": 0.8361234664916992, + "num_tokens": 38182662.0, + "step": 1002 + }, + { + "epoch": 0.12759190942628165, + "ewc_loss": 0.0069580078125, + "ewc_loss_parallel": 6.943941116333008e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.665311813354492, + "learning_rate": 4.2475625264942774e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8416522741317749, + "num_tokens": 38224735.0, + "step": 1003 + }, + { + "epoch": 0.12771911970487215, + "ewc_loss": 0.006927490234375, + "ewc_loss_parallel": 6.9141387939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.727293014526367, + "learning_rate": 4.251801610852056e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.8278605937957764, + "num_tokens": 38255847.0, + "step": 1004 + }, + { + "epoch": 0.12784632998346265, + "ewc_loss": 0.006988525390625, + "ewc_loss_parallel": 6.973743438720703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.904386520385742, + "learning_rate": 4.2560406952098343e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8248149156570435, + "num_tokens": 38296921.0, + "step": 1005 + }, + { + "epoch": 0.12797354026205318, + "ewc_loss": 0.006988525390625, + "ewc_loss_parallel": 6.973743438720703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.56218433380127, + "learning_rate": 4.2602797795676133e-07, + "loss": 0.568, + "mean_token_accuracy": 0.8217660784721375, + "num_tokens": 38336408.0, + "step": 1006 + }, + { + "epoch": 0.12810075054064368, + "ewc_loss": 0.006927490234375, + "ewc_loss_parallel": 6.9141387939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.719252586364746, + "learning_rate": 4.2645188639253923e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8467041850090027, + "num_tokens": 38372809.0, + "step": 1007 + }, + { + "epoch": 0.12822796081923418, + "ewc_loss": 0.006988525390625, + "ewc_loss_parallel": 7.0035457611083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.731159210205078, + "learning_rate": 4.26875794828317e-07, + "loss": 0.5349, + "mean_token_accuracy": 0.8290074467658997, + "num_tokens": 38413312.0, + "step": 1008 + }, + { + "epoch": 0.1283551710978247, + "ewc_loss": 0.006988525390625, + "ewc_loss_parallel": 6.973743438720703e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.683406829833984, + "learning_rate": 4.272997032640949e-07, + "loss": 0.557, + "mean_token_accuracy": 0.8229226469993591, + "num_tokens": 38457710.0, + "step": 1009 + }, + { + "epoch": 0.1284823813764152, + "ewc_loss": 0.00701904296875, + "ewc_loss_parallel": 7.033348083496094e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.77942943572998, + "learning_rate": 4.277236116998728e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.836679220199585, + "num_tokens": 38499630.0, + "step": 1010 + }, + { + "epoch": 0.1286095916550057, + "ewc_loss": 0.007049560546875, + "ewc_loss_parallel": 7.063150405883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.792839050292969, + "learning_rate": 4.281475201356507e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8346880674362183, + "num_tokens": 38538133.0, + "step": 1011 + }, + { + "epoch": 0.12873680193359624, + "ewc_loss": 0.00701904296875, + "ewc_loss_parallel": 7.033348083496094e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.722024917602539, + "learning_rate": 4.285714285714285e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.8249072432518005, + "num_tokens": 38576816.0, + "step": 1012 + }, + { + "epoch": 0.12886401221218674, + "ewc_loss": 0.007049560546875, + "ewc_loss_parallel": 7.063150405883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.858878135681152, + "learning_rate": 4.289953370072064e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8424280881881714, + "num_tokens": 38615270.0, + "step": 1013 + }, + { + "epoch": 0.12899122249077727, + "ewc_loss": 0.007110595703125, + "ewc_loss_parallel": 7.12275505065918e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.74111557006836, + "learning_rate": 4.294192454429843e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8257010579109192, + "num_tokens": 38654173.0, + "step": 1014 + }, + { + "epoch": 0.12911843276936777, + "ewc_loss": 0.006988525390625, + "ewc_loss_parallel": 7.0035457611083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.629594802856445, + "learning_rate": 4.298431538787622e-07, + "loss": 0.432, + "mean_token_accuracy": 0.8623251914978027, + "num_tokens": 38695758.0, + "step": 1015 + }, + { + "epoch": 0.12924564304795827, + "ewc_loss": 0.00701904296875, + "ewc_loss_parallel": 7.033348083496094e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.842967987060547, + "learning_rate": 4.3026706231454e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8169549107551575, + "num_tokens": 38729524.0, + "step": 1016 + }, + { + "epoch": 0.1293728533265488, + "ewc_loss": 0.007080078125, + "ewc_loss_parallel": 7.092952728271484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.697041511535645, + "learning_rate": 4.306909707503179e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8364776372909546, + "num_tokens": 38767500.0, + "step": 1017 + }, + { + "epoch": 0.1295000636051393, + "ewc_loss": 0.00701904296875, + "ewc_loss_parallel": 7.033348083496094e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.788421630859375, + "learning_rate": 4.311148791860958e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8356242179870605, + "num_tokens": 38798683.0, + "step": 1018 + }, + { + "epoch": 0.1296272738837298, + "ewc_loss": 0.007110595703125, + "ewc_loss_parallel": 7.12275505065918e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.849468231201172, + "learning_rate": 4.315387876218737e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.8231849670410156, + "num_tokens": 38833653.0, + "step": 1019 + }, + { + "epoch": 0.12975448416232033, + "ewc_loss": 0.007080078125, + "ewc_loss_parallel": 7.092952728271484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.783622741699219, + "learning_rate": 4.319626960576515e-07, + "loss": 0.5596, + "mean_token_accuracy": 0.820275068283081, + "num_tokens": 38869428.0, + "step": 1020 + }, + { + "epoch": 0.12988169444091083, + "ewc_loss": 0.007080078125, + "ewc_loss_parallel": 7.092952728271484e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.82163143157959, + "learning_rate": 4.323866044934294e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8377563953399658, + "num_tokens": 38909502.0, + "step": 1021 + }, + { + "epoch": 0.13000890471950133, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.727696418762207, + "learning_rate": 4.328105129292073e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.8360294699668884, + "num_tokens": 38951730.0, + "step": 1022 + }, + { + "epoch": 0.13013611499809186, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.652445793151855, + "learning_rate": 4.332344213649852e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8380230069160461, + "num_tokens": 38988981.0, + "step": 1023 + }, + { + "epoch": 0.13026332527668236, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.691095352172852, + "learning_rate": 4.33658329800763e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8400635719299316, + "num_tokens": 39026289.0, + "step": 1024 + }, + { + "epoch": 0.13039053555527286, + "ewc_loss": 0.007110595703125, + "ewc_loss_parallel": 7.12275505065918e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.767659187316895, + "learning_rate": 4.340822382365409e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.83852219581604, + "num_tokens": 39069113.0, + "step": 1025 + }, + { + "epoch": 0.13051774583386339, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.754389762878418, + "learning_rate": 4.345061466723188e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8461475968360901, + "num_tokens": 39112991.0, + "step": 1026 + }, + { + "epoch": 0.13064495611245389, + "ewc_loss": 0.007171630859375, + "ewc_loss_parallel": 7.18235969543457e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.776925086975098, + "learning_rate": 4.3493005510809663e-07, + "loss": 0.5446, + "mean_token_accuracy": 0.8282307386398315, + "num_tokens": 39149593.0, + "step": 1027 + }, + { + "epoch": 0.1307721663910444, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.90604305267334, + "learning_rate": 4.353539635438745e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8375231027603149, + "num_tokens": 39186570.0, + "step": 1028 + }, + { + "epoch": 0.13089937666963491, + "ewc_loss": 0.007171630859375, + "ewc_loss_parallel": 7.18235969543457e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.846163749694824, + "learning_rate": 4.357778719796524e-07, + "loss": 0.571, + "mean_token_accuracy": 0.817068338394165, + "num_tokens": 39220666.0, + "step": 1029 + }, + { + "epoch": 0.13102658694822542, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.877298355102539, + "learning_rate": 4.362017804154303e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8446714282035828, + "num_tokens": 39258372.0, + "step": 1030 + }, + { + "epoch": 0.13115379722681592, + "ewc_loss": 0.0072021484375, + "ewc_loss_parallel": 7.212162017822266e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.859256744384766, + "learning_rate": 4.366256888512081e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8463922142982483, + "num_tokens": 39293647.0, + "step": 1031 + }, + { + "epoch": 0.13128100750540644, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.747572898864746, + "learning_rate": 4.3704959728698597e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8347582221031189, + "num_tokens": 39332811.0, + "step": 1032 + }, + { + "epoch": 0.13140821778399694, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.940815925598145, + "learning_rate": 4.3747350572276386e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8339845538139343, + "num_tokens": 39365971.0, + "step": 1033 + }, + { + "epoch": 0.13153542806258745, + "ewc_loss": 0.00726318359375, + "ewc_loss_parallel": 7.271766662597656e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.774677276611328, + "learning_rate": 4.3789741415854176e-07, + "loss": 0.5297, + "mean_token_accuracy": 0.832958996295929, + "num_tokens": 39403178.0, + "step": 1034 + }, + { + "epoch": 0.13166263834117797, + "ewc_loss": 0.00714111328125, + "ewc_loss_parallel": 7.152557373046875e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.823163032531738, + "learning_rate": 4.383213225943196e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8329308032989502, + "num_tokens": 39441303.0, + "step": 1035 + }, + { + "epoch": 0.13178984861976847, + "ewc_loss": 0.007232666015625, + "ewc_loss_parallel": 7.241964340209961e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.805877685546875, + "learning_rate": 4.3874523103009746e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8521503806114197, + "num_tokens": 39477359.0, + "step": 1036 + }, + { + "epoch": 0.13191705889835897, + "ewc_loss": 0.007232666015625, + "ewc_loss_parallel": 7.241964340209961e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.800521850585938, + "learning_rate": 4.3916913946587536e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8447203636169434, + "num_tokens": 39511687.0, + "step": 1037 + }, + { + "epoch": 0.1320442691769495, + "ewc_loss": 0.00732421875, + "ewc_loss_parallel": 7.331371307373047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.856371879577637, + "learning_rate": 4.3959304790165325e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8408910036087036, + "num_tokens": 39547453.0, + "step": 1038 + }, + { + "epoch": 0.13217147945554, + "ewc_loss": 0.00726318359375, + "ewc_loss_parallel": 7.271766662597656e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.94178581237793, + "learning_rate": 4.400169563374311e-07, + "loss": 0.5063, + "mean_token_accuracy": 0.8372157216072083, + "num_tokens": 39588934.0, + "step": 1039 + }, + { + "epoch": 0.13229868973413053, + "ewc_loss": 0.007293701171875, + "ewc_loss_parallel": 7.3015689849853516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.87307071685791, + "learning_rate": 4.4044086477320895e-07, + "loss": 0.5994, + "mean_token_accuracy": 0.8154809474945068, + "num_tokens": 39627638.0, + "step": 1040 + }, + { + "epoch": 0.13242590001272103, + "ewc_loss": 0.007232666015625, + "ewc_loss_parallel": 7.241964340209961e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.897027969360352, + "learning_rate": 4.4086477320898685e-07, + "loss": 0.506, + "mean_token_accuracy": 0.8374277353286743, + "num_tokens": 39661696.0, + "step": 1041 + }, + { + "epoch": 0.13255311029131153, + "ewc_loss": 0.007293701171875, + "ewc_loss_parallel": 7.3015689849853516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.880155563354492, + "learning_rate": 4.4128868164476474e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.840277373790741, + "num_tokens": 39702357.0, + "step": 1042 + }, + { + "epoch": 0.13268032056990206, + "ewc_loss": 0.00732421875, + "ewc_loss_parallel": 7.331371307373047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.857732772827148, + "learning_rate": 4.417125900805426e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8416647911071777, + "num_tokens": 39744960.0, + "step": 1043 + }, + { + "epoch": 0.13280753084849256, + "ewc_loss": 0.00732421875, + "ewc_loss_parallel": 7.331371307373047e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.892093658447266, + "learning_rate": 4.4213649851632044e-07, + "loss": 0.5574, + "mean_token_accuracy": 0.8264672160148621, + "num_tokens": 39785394.0, + "step": 1044 + }, + { + "epoch": 0.13293474112708306, + "ewc_loss": 0.00738525390625, + "ewc_loss_parallel": 7.3909759521484375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.976778030395508, + "learning_rate": 4.4256040695209834e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8325023651123047, + "num_tokens": 39822698.0, + "step": 1045 + }, + { + "epoch": 0.1330619514056736, + "ewc_loss": 0.007354736328125, + "ewc_loss_parallel": 7.361173629760742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.934375762939453, + "learning_rate": 4.429843153878762e-07, + "loss": 0.5364, + "mean_token_accuracy": 0.8278343677520752, + "num_tokens": 39856874.0, + "step": 1046 + }, + { + "epoch": 0.1331891616842641, + "ewc_loss": 0.007354736328125, + "ewc_loss_parallel": 7.361173629760742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.891219139099121, + "learning_rate": 4.434082238236541e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8200808763504028, + "num_tokens": 39899683.0, + "step": 1047 + }, + { + "epoch": 0.1333163719628546, + "ewc_loss": 0.007354736328125, + "ewc_loss_parallel": 7.361173629760742e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.932842254638672, + "learning_rate": 4.4383213225943193e-07, + "loss": 0.5461, + "mean_token_accuracy": 0.8293055295944214, + "num_tokens": 39940565.0, + "step": 1048 + }, + { + "epoch": 0.13344358224144512, + "ewc_loss": 0.0074462890625, + "ewc_loss_parallel": 7.450580596923828e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.908238410949707, + "learning_rate": 4.442560406952098e-07, + "loss": 0.5456, + "mean_token_accuracy": 0.8247959613800049, + "num_tokens": 39979010.0, + "step": 1049 + }, + { + "epoch": 0.13357079252003562, + "ewc_loss": 0.00738525390625, + "ewc_loss_parallel": 7.3909759521484375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.942458152770996, + "learning_rate": 4.4467994913098767e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8428441286087036, + "num_tokens": 40021151.0, + "step": 1050 + }, + { + "epoch": 0.13369800279862612, + "ewc_loss": 0.00738525390625, + "ewc_loss_parallel": 7.3909759521484375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.965224266052246, + "learning_rate": 4.4510385756676557e-07, + "loss": 0.5542, + "mean_token_accuracy": 0.823411226272583, + "num_tokens": 40059324.0, + "step": 1051 + }, + { + "epoch": 0.13382521307721665, + "ewc_loss": 0.00738525390625, + "ewc_loss_parallel": 7.3909759521484375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.831328392028809, + "learning_rate": 4.455277660025434e-07, + "loss": 0.5092, + "mean_token_accuracy": 0.836510181427002, + "num_tokens": 40104577.0, + "step": 1052 + }, + { + "epoch": 0.13395242335580715, + "ewc_loss": 0.00750732421875, + "ewc_loss_parallel": 7.510185241699219e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.00139331817627, + "learning_rate": 4.459516744383213e-07, + "loss": 0.5332, + "mean_token_accuracy": 0.827804446220398, + "num_tokens": 40139734.0, + "step": 1053 + }, + { + "epoch": 0.13407963363439765, + "ewc_loss": 0.007537841796875, + "ewc_loss_parallel": 7.539987564086914e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.950247764587402, + "learning_rate": 4.4637558287409916e-07, + "loss": 0.5305, + "mean_token_accuracy": 0.8310688138008118, + "num_tokens": 40180724.0, + "step": 1054 + }, + { + "epoch": 0.13420684391298818, + "ewc_loss": 0.0074462890625, + "ewc_loss_parallel": 7.450580596923828e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.99051570892334, + "learning_rate": 4.4679949130987706e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.8376107215881348, + "num_tokens": 40220475.0, + "step": 1055 + }, + { + "epoch": 0.13433405419157868, + "ewc_loss": 0.0074462890625, + "ewc_loss_parallel": 7.450580596923828e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.947351455688477, + "learning_rate": 4.472233997456549e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.8288286924362183, + "num_tokens": 40258208.0, + "step": 1056 + }, + { + "epoch": 0.13446126447016918, + "ewc_loss": 0.007476806640625, + "ewc_loss_parallel": 7.4803829193115234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.901870727539062, + "learning_rate": 4.476473081814328e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.832162618637085, + "num_tokens": 40299127.0, + "step": 1057 + }, + { + "epoch": 0.1345884747487597, + "ewc_loss": 0.007476806640625, + "ewc_loss_parallel": 7.4803829193115234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.937260627746582, + "learning_rate": 4.4807121661721065e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.843725323677063, + "num_tokens": 40339331.0, + "step": 1058 + }, + { + "epoch": 0.1347156850273502, + "ewc_loss": 0.00762939453125, + "ewc_loss_parallel": 7.62939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.940528869628906, + "learning_rate": 4.4849512505298855e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.831292450428009, + "num_tokens": 40379386.0, + "step": 1059 + }, + { + "epoch": 0.1348428953059407, + "ewc_loss": 0.007476806640625, + "ewc_loss_parallel": 7.4803829193115234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.827064514160156, + "learning_rate": 4.489190334887664e-07, + "loss": 0.5645, + "mean_token_accuracy": 0.8270004987716675, + "num_tokens": 40419868.0, + "step": 1060 + }, + { + "epoch": 0.13497010558453124, + "ewc_loss": 0.00750732421875, + "ewc_loss_parallel": 7.510185241699219e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.028865814208984, + "learning_rate": 4.493429419245443e-07, + "loss": 0.5326, + "mean_token_accuracy": 0.8325956463813782, + "num_tokens": 40455346.0, + "step": 1061 + }, + { + "epoch": 0.13509731586312174, + "ewc_loss": 0.007568359375, + "ewc_loss_parallel": 7.569789886474609e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.957399368286133, + "learning_rate": 4.4976685036032214e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.842049241065979, + "num_tokens": 40490087.0, + "step": 1062 + }, + { + "epoch": 0.13522452614171224, + "ewc_loss": 0.00750732421875, + "ewc_loss_parallel": 7.510185241699219e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.041399955749512, + "learning_rate": 4.5019075879610004e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.822020411491394, + "num_tokens": 40524799.0, + "step": 1063 + }, + { + "epoch": 0.13535173642030277, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.068105697631836, + "learning_rate": 4.506146672318779e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8334411382675171, + "num_tokens": 40563740.0, + "step": 1064 + }, + { + "epoch": 0.13547894669889327, + "ewc_loss": 0.007568359375, + "ewc_loss_parallel": 7.569789886474609e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.906197547912598, + "learning_rate": 4.5103857566765573e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8240394592285156, + "num_tokens": 40604743.0, + "step": 1065 + }, + { + "epoch": 0.1356061569774838, + "ewc_loss": 0.007568359375, + "ewc_loss_parallel": 7.569789886474609e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.011568069458008, + "learning_rate": 4.5146248410343363e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8466009497642517, + "num_tokens": 40640909.0, + "step": 1066 + }, + { + "epoch": 0.1357333672560743, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.068324089050293, + "learning_rate": 4.5188639253921153e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.844881534576416, + "num_tokens": 40678930.0, + "step": 1067 + }, + { + "epoch": 0.1358605775346648, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.040764808654785, + "learning_rate": 4.523103009749894e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8445001840591431, + "num_tokens": 40715609.0, + "step": 1068 + }, + { + "epoch": 0.13598778781325532, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.974160194396973, + "learning_rate": 4.527342094107672e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8364561200141907, + "num_tokens": 40760533.0, + "step": 1069 + }, + { + "epoch": 0.13611499809184582, + "ewc_loss": 0.007568359375, + "ewc_loss_parallel": 7.569789886474609e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.013204574584961, + "learning_rate": 4.531581178465451e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8309742212295532, + "num_tokens": 40800039.0, + "step": 1070 + }, + { + "epoch": 0.13624220837043632, + "ewc_loss": 0.007568359375, + "ewc_loss_parallel": 7.569789886474609e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.014046669006348, + "learning_rate": 4.53582026282323e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8411903977394104, + "num_tokens": 40840207.0, + "step": 1071 + }, + { + "epoch": 0.13636941864902685, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.974178314208984, + "learning_rate": 4.5400593471810087e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8161674737930298, + "num_tokens": 40879559.0, + "step": 1072 + }, + { + "epoch": 0.13649662892761735, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.003345489501953, + "learning_rate": 4.544298431538787e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8387813568115234, + "num_tokens": 40919608.0, + "step": 1073 + }, + { + "epoch": 0.13662383920620785, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.103632926940918, + "learning_rate": 4.548537515896566e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.8198221921920776, + "num_tokens": 40953648.0, + "step": 1074 + }, + { + "epoch": 0.13675104948479838, + "ewc_loss": 0.007598876953125, + "ewc_loss_parallel": 7.599592208862305e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.05536937713623, + "learning_rate": 4.552776600254345e-07, + "loss": 0.5399, + "mean_token_accuracy": 0.8294236660003662, + "num_tokens": 40989675.0, + "step": 1075 + }, + { + "epoch": 0.13687825976338888, + "ewc_loss": 0.00762939453125, + "ewc_loss_parallel": 7.62939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.02551555633545, + "learning_rate": 4.5570156846121236e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8436180949211121, + "num_tokens": 41027992.0, + "step": 1076 + }, + { + "epoch": 0.13700547004197938, + "ewc_loss": 0.00762939453125, + "ewc_loss_parallel": 7.62939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.107522964477539, + "learning_rate": 4.561254768969902e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8404577970504761, + "num_tokens": 41066925.0, + "step": 1077 + }, + { + "epoch": 0.1371326803205699, + "ewc_loss": 0.00762939453125, + "ewc_loss_parallel": 7.62939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.96033763885498, + "learning_rate": 4.565493853327681e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8417818546295166, + "num_tokens": 41105090.0, + "step": 1078 + }, + { + "epoch": 0.1372598905991604, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.232565879821777, + "learning_rate": 4.56973293768546e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.832269012928009, + "num_tokens": 41141924.0, + "step": 1079 + }, + { + "epoch": 0.1373871008777509, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.08324909210205, + "learning_rate": 4.573972022043238e-07, + "loss": 0.6036, + "mean_token_accuracy": 0.809248685836792, + "num_tokens": 41181512.0, + "step": 1080 + }, + { + "epoch": 0.13751431115634144, + "ewc_loss": 0.00762939453125, + "ewc_loss_parallel": 7.62939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 11.998373985290527, + "learning_rate": 4.578211106401017e-07, + "loss": 0.551, + "mean_token_accuracy": 0.8246284127235413, + "num_tokens": 41224015.0, + "step": 1081 + }, + { + "epoch": 0.13764152143493194, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.072265625, + "learning_rate": 4.582450190758796e-07, + "loss": 0.5528, + "mean_token_accuracy": 0.8263922929763794, + "num_tokens": 41265568.0, + "step": 1082 + }, + { + "epoch": 0.13776873171352244, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.04740047454834, + "learning_rate": 4.586689275116575e-07, + "loss": 0.465, + "mean_token_accuracy": 0.850975751876831, + "num_tokens": 41303595.0, + "step": 1083 + }, + { + "epoch": 0.13789594199211297, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.144268989562988, + "learning_rate": 4.590928359474353e-07, + "loss": 0.5705, + "mean_token_accuracy": 0.8221426010131836, + "num_tokens": 41340119.0, + "step": 1084 + }, + { + "epoch": 0.13802315227070347, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.116059303283691, + "learning_rate": 4.595167443832132e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.837476909160614, + "num_tokens": 41380184.0, + "step": 1085 + }, + { + "epoch": 0.13815036254929397, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.034926414489746, + "learning_rate": 4.599406528189911e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8494983315467834, + "num_tokens": 41419038.0, + "step": 1086 + }, + { + "epoch": 0.1382775728278845, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.125872611999512, + "learning_rate": 4.60364561254769e-07, + "loss": 0.5086, + "mean_token_accuracy": 0.8346143364906311, + "num_tokens": 41455767.0, + "step": 1087 + }, + { + "epoch": 0.138404783106475, + "ewc_loss": 0.00762939453125, + "ewc_loss_parallel": 7.62939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.260269165039062, + "learning_rate": 4.607884696905468e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8374367952346802, + "num_tokens": 41497493.0, + "step": 1088 + }, + { + "epoch": 0.1385319933850655, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.050371170043945, + "learning_rate": 4.612123781263247e-07, + "loss": 0.5357, + "mean_token_accuracy": 0.8301644325256348, + "num_tokens": 41542940.0, + "step": 1089 + }, + { + "epoch": 0.13865920366365603, + "ewc_loss": 0.00762939453125, + "ewc_loss_parallel": 7.62939453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.174864768981934, + "learning_rate": 4.616362865621026e-07, + "loss": 0.4361, + "mean_token_accuracy": 0.8565027713775635, + "num_tokens": 41579302.0, + "step": 1090 + }, + { + "epoch": 0.13878641394224653, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.135092735290527, + "learning_rate": 4.620601949978805e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.8312575817108154, + "num_tokens": 41617555.0, + "step": 1091 + }, + { + "epoch": 0.13891362422083706, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.103464126586914, + "learning_rate": 4.6248410343365827e-07, + "loss": 0.5855, + "mean_token_accuracy": 0.8140828013420105, + "num_tokens": 41654660.0, + "step": 1092 + }, + { + "epoch": 0.13904083449942756, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.07858943939209, + "learning_rate": 4.6290801186943617e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8424609899520874, + "num_tokens": 41693773.0, + "step": 1093 + }, + { + "epoch": 0.13916804477801806, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.036812782287598, + "learning_rate": 4.6333192030521407e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8413670063018799, + "num_tokens": 41728864.0, + "step": 1094 + }, + { + "epoch": 0.13929525505660859, + "ewc_loss": 0.0076904296875, + "ewc_loss_parallel": 7.68899917602539e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.122074127197266, + "learning_rate": 4.6375582874099196e-07, + "loss": 0.482, + "mean_token_accuracy": 0.8455995917320251, + "num_tokens": 41770596.0, + "step": 1095 + }, + { + "epoch": 0.1394224653351991, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.04488754272461, + "learning_rate": 4.6417973717676976e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8433177471160889, + "num_tokens": 41806585.0, + "step": 1096 + }, + { + "epoch": 0.1395496756137896, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.144166946411133, + "learning_rate": 4.6460364561254766e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.8303141593933105, + "num_tokens": 41845488.0, + "step": 1097 + }, + { + "epoch": 0.13967688589238011, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.18511962890625, + "learning_rate": 4.6502755404832556e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8405709266662598, + "num_tokens": 41880674.0, + "step": 1098 + }, + { + "epoch": 0.13980409617097062, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.195478439331055, + "learning_rate": 4.654514624841034e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8368365168571472, + "num_tokens": 41913020.0, + "step": 1099 + }, + { + "epoch": 0.13993130644956112, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.3157958984375, + "learning_rate": 4.6587537091988125e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8377689123153687, + "num_tokens": 41948483.0, + "step": 1100 + }, + { + "epoch": 0.14005851672815164, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.287211418151855, + "learning_rate": 4.6629927935565915e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8450942039489746, + "num_tokens": 41985659.0, + "step": 1101 + }, + { + "epoch": 0.14018572700674214, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.115751266479492, + "learning_rate": 4.6672318779143705e-07, + "loss": 0.5834, + "mean_token_accuracy": 0.8164722919464111, + "num_tokens": 42027746.0, + "step": 1102 + }, + { + "epoch": 0.14031293728533265, + "ewc_loss": 0.00775146484375, + "ewc_loss_parallel": 7.748603820800781e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.26691722869873, + "learning_rate": 4.671470962272149e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.827308177947998, + "num_tokens": 42060903.0, + "step": 1103 + }, + { + "epoch": 0.14044014756392317, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.214556694030762, + "learning_rate": 4.6757100466299274e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.8332637548446655, + "num_tokens": 42102722.0, + "step": 1104 + }, + { + "epoch": 0.14056735784251367, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.198467254638672, + "learning_rate": 4.6799491309877064e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8254144191741943, + "num_tokens": 42143418.0, + "step": 1105 + }, + { + "epoch": 0.14069456812110417, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.212813377380371, + "learning_rate": 4.6841882153454854e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8427122831344604, + "num_tokens": 42177313.0, + "step": 1106 + }, + { + "epoch": 0.1408217783996947, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.282185554504395, + "learning_rate": 4.688427299703264e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8483529090881348, + "num_tokens": 42214158.0, + "step": 1107 + }, + { + "epoch": 0.1409489886782852, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.153898239135742, + "learning_rate": 4.6926663840610423e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8350378274917603, + "num_tokens": 42254189.0, + "step": 1108 + }, + { + "epoch": 0.1410761989568757, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.27721118927002, + "learning_rate": 4.6969054684188213e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8241465091705322, + "num_tokens": 42296853.0, + "step": 1109 + }, + { + "epoch": 0.14120340923546623, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.230351448059082, + "learning_rate": 4.7011445527766003e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8332959413528442, + "num_tokens": 42331983.0, + "step": 1110 + }, + { + "epoch": 0.14133061951405673, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.14443302154541, + "learning_rate": 4.7053836371343787e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8354398012161255, + "num_tokens": 42370496.0, + "step": 1111 + }, + { + "epoch": 0.14145782979264723, + "ewc_loss": 0.0078125, + "ewc_loss_parallel": 7.808208465576172e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.340744972229004, + "learning_rate": 4.709622721492157e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8393762111663818, + "num_tokens": 42406489.0, + "step": 1112 + }, + { + "epoch": 0.14158504007123776, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.360148429870605, + "learning_rate": 4.713861805849936e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8387312889099121, + "num_tokens": 42440679.0, + "step": 1113 + }, + { + "epoch": 0.14171225034982826, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.228656768798828, + "learning_rate": 4.718100890207715e-07, + "loss": 0.5746, + "mean_token_accuracy": 0.8173525333404541, + "num_tokens": 42476699.0, + "step": 1114 + }, + { + "epoch": 0.1418394606284188, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.340749740600586, + "learning_rate": 4.7223399745654936e-07, + "loss": 0.5306, + "mean_token_accuracy": 0.8315371870994568, + "num_tokens": 42513348.0, + "step": 1115 + }, + { + "epoch": 0.1419666709070093, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.34717845916748, + "learning_rate": 4.726579058923272e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.836309552192688, + "num_tokens": 42550633.0, + "step": 1116 + }, + { + "epoch": 0.1420938811855998, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.30082893371582, + "learning_rate": 4.730818143281051e-07, + "loss": 0.546, + "mean_token_accuracy": 0.8298760652542114, + "num_tokens": 42593649.0, + "step": 1117 + }, + { + "epoch": 0.14222109146419032, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.312434196472168, + "learning_rate": 4.7350572276388295e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.842394232749939, + "num_tokens": 42632379.0, + "step": 1118 + }, + { + "epoch": 0.14234830174278082, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.404672622680664, + "learning_rate": 4.7392963119966085e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8337863087654114, + "num_tokens": 42670319.0, + "step": 1119 + }, + { + "epoch": 0.14247551202137132, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.29625415802002, + "learning_rate": 4.7435353963543875e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8378877639770508, + "num_tokens": 42712680.0, + "step": 1120 + }, + { + "epoch": 0.14260272229996185, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.315295219421387, + "learning_rate": 4.747774480712166e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8339815139770508, + "num_tokens": 42748241.0, + "step": 1121 + }, + { + "epoch": 0.14272993257855235, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.46017074584961, + "learning_rate": 4.7520135650699444e-07, + "loss": 0.5504, + "mean_token_accuracy": 0.8268893957138062, + "num_tokens": 42783208.0, + "step": 1122 + }, + { + "epoch": 0.14285714285714285, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.38823413848877, + "learning_rate": 4.7562526494277234e-07, + "loss": 0.5247, + "mean_token_accuracy": 0.8349792957305908, + "num_tokens": 42818570.0, + "step": 1123 + }, + { + "epoch": 0.14298435313573338, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.272984504699707, + "learning_rate": 4.7604917337855024e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8496227264404297, + "num_tokens": 42852891.0, + "step": 1124 + }, + { + "epoch": 0.14311156341432388, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.406676292419434, + "learning_rate": 4.764730818143281e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.829879641532898, + "num_tokens": 42887359.0, + "step": 1125 + }, + { + "epoch": 0.14323877369291438, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.295372009277344, + "learning_rate": 4.768969902501059e-07, + "loss": 0.5671, + "mean_token_accuracy": 0.8199899196624756, + "num_tokens": 42929308.0, + "step": 1126 + }, + { + "epoch": 0.1433659839715049, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.379881858825684, + "learning_rate": 4.773208986858838e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.8337193131446838, + "num_tokens": 42969373.0, + "step": 1127 + }, + { + "epoch": 0.1434931942500954, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.28520679473877, + "learning_rate": 4.777448071216617e-07, + "loss": 0.5203, + "mean_token_accuracy": 0.835921049118042, + "num_tokens": 43010133.0, + "step": 1128 + }, + { + "epoch": 0.1436204045286859, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.335691452026367, + "learning_rate": 4.781687155574396e-07, + "loss": 0.5469, + "mean_token_accuracy": 0.8268896341323853, + "num_tokens": 43053743.0, + "step": 1129 + }, + { + "epoch": 0.14374761480727644, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.256495475769043, + "learning_rate": 4.785926239932175e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8439429998397827, + "num_tokens": 43094125.0, + "step": 1130 + }, + { + "epoch": 0.14387482508586694, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.40746784210205, + "learning_rate": 4.790165324289953e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8343490362167358, + "num_tokens": 43132399.0, + "step": 1131 + }, + { + "epoch": 0.14400203536445744, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.369135856628418, + "learning_rate": 4.794404408647732e-07, + "loss": 0.5287, + "mean_token_accuracy": 0.8338662981987, + "num_tokens": 43169036.0, + "step": 1132 + }, + { + "epoch": 0.14412924564304797, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.343294143676758, + "learning_rate": 4.798643493005511e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8514181971549988, + "num_tokens": 43209397.0, + "step": 1133 + }, + { + "epoch": 0.14425645592163847, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.308039665222168, + "learning_rate": 4.80288257736329e-07, + "loss": 0.5558, + "mean_token_accuracy": 0.8253411054611206, + "num_tokens": 43246788.0, + "step": 1134 + }, + { + "epoch": 0.14438366620022897, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.363608360290527, + "learning_rate": 4.807121661721068e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8498232364654541, + "num_tokens": 43282774.0, + "step": 1135 + }, + { + "epoch": 0.1445108764788195, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.350996017456055, + "learning_rate": 4.811360746078847e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8372782468795776, + "num_tokens": 43322303.0, + "step": 1136 + }, + { + "epoch": 0.14463808675741, + "ewc_loss": 0.00787353515625, + "ewc_loss_parallel": 7.867813110351562e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.36040210723877, + "learning_rate": 4.815599830436625e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8353043794631958, + "num_tokens": 43354611.0, + "step": 1137 + }, + { + "epoch": 0.1447652970360005, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.360922813415527, + "learning_rate": 4.819838914794405e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.8407561779022217, + "num_tokens": 43390640.0, + "step": 1138 + }, + { + "epoch": 0.14489250731459102, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.289999008178711, + "learning_rate": 4.824077999152183e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8302294611930847, + "num_tokens": 43424513.0, + "step": 1139 + }, + { + "epoch": 0.14501971759318152, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.276989936828613, + "learning_rate": 4.828317083509962e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8323386907577515, + "num_tokens": 43461699.0, + "step": 1140 + }, + { + "epoch": 0.14514692787177205, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.469209671020508, + "learning_rate": 4.83255616786774e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8443878293037415, + "num_tokens": 43494681.0, + "step": 1141 + }, + { + "epoch": 0.14527413815036255, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.399243354797363, + "learning_rate": 4.83679525222552e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8482863306999207, + "num_tokens": 43531318.0, + "step": 1142 + }, + { + "epoch": 0.14540134842895305, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.376792907714844, + "learning_rate": 4.841034336583298e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.847821056842804, + "num_tokens": 43569991.0, + "step": 1143 + }, + { + "epoch": 0.14552855870754358, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.312251091003418, + "learning_rate": 4.845273420941076e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8489515781402588, + "num_tokens": 43609978.0, + "step": 1144 + }, + { + "epoch": 0.14565576898613408, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.391182899475098, + "learning_rate": 4.849512505298855e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8283102512359619, + "num_tokens": 43640419.0, + "step": 1145 + }, + { + "epoch": 0.14578297926472458, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.38520622253418, + "learning_rate": 4.853751589656634e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8445019721984863, + "num_tokens": 43682408.0, + "step": 1146 + }, + { + "epoch": 0.1459101895433151, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.401800155639648, + "learning_rate": 4.857990674014413e-07, + "loss": 0.5278, + "mean_token_accuracy": 0.8312177658081055, + "num_tokens": 43726158.0, + "step": 1147 + }, + { + "epoch": 0.1460373998219056, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.394098281860352, + "learning_rate": 4.862229758372191e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8210121989250183, + "num_tokens": 43766466.0, + "step": 1148 + }, + { + "epoch": 0.1461646101004961, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.356945991516113, + "learning_rate": 4.86646884272997e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8251600861549377, + "num_tokens": 43807255.0, + "step": 1149 + }, + { + "epoch": 0.14629182037908664, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.347716331481934, + "learning_rate": 4.870707927087749e-07, + "loss": 0.5222, + "mean_token_accuracy": 0.832148015499115, + "num_tokens": 43845419.0, + "step": 1150 + }, + { + "epoch": 0.14641903065767714, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.490226745605469, + "learning_rate": 4.874947011445528e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.8337206840515137, + "num_tokens": 43882942.0, + "step": 1151 + }, + { + "epoch": 0.14654624093626764, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.353541374206543, + "learning_rate": 4.879186095803306e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.8559638261795044, + "num_tokens": 43923109.0, + "step": 1152 + }, + { + "epoch": 0.14667345121485817, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.503822326660156, + "learning_rate": 4.883425180161085e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8421391844749451, + "num_tokens": 43959752.0, + "step": 1153 + }, + { + "epoch": 0.14680066149344867, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.548168182373047, + "learning_rate": 4.887664264518864e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.834293007850647, + "num_tokens": 43996247.0, + "step": 1154 + }, + { + "epoch": 0.14692787177203917, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.337867736816406, + "learning_rate": 4.891903348876643e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8386348485946655, + "num_tokens": 44035751.0, + "step": 1155 + }, + { + "epoch": 0.1470550820506297, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.48686695098877, + "learning_rate": 4.896142433234421e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8299074172973633, + "num_tokens": 44073225.0, + "step": 1156 + }, + { + "epoch": 0.1471822923292202, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.468146324157715, + "learning_rate": 4.9003815175922e-07, + "loss": 0.5548, + "mean_token_accuracy": 0.8298104405403137, + "num_tokens": 44111761.0, + "step": 1157 + }, + { + "epoch": 0.1473095026078107, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.491997718811035, + "learning_rate": 4.904620601949979e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8383140563964844, + "num_tokens": 44151311.0, + "step": 1158 + }, + { + "epoch": 0.14743671288640123, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.47915267944336, + "learning_rate": 4.908859686307758e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.8279401063919067, + "num_tokens": 44184623.0, + "step": 1159 + }, + { + "epoch": 0.14756392316499173, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.417906761169434, + "learning_rate": 4.913098770665536e-07, + "loss": 0.5369, + "mean_token_accuracy": 0.8330686092376709, + "num_tokens": 44221753.0, + "step": 1160 + }, + { + "epoch": 0.14769113344358223, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.569067001342773, + "learning_rate": 4.917337855023314e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.839706301689148, + "num_tokens": 44262542.0, + "step": 1161 + }, + { + "epoch": 0.14781834372217276, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.532443046569824, + "learning_rate": 4.921576939381094e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8406579494476318, + "num_tokens": 44300294.0, + "step": 1162 + }, + { + "epoch": 0.14794555400076326, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.503740310668945, + "learning_rate": 4.925816023738872e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.8221731185913086, + "num_tokens": 44338476.0, + "step": 1163 + }, + { + "epoch": 0.14807276427935376, + "ewc_loss": 0.0079345703125, + "ewc_loss_parallel": 7.927417755126953e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.515341758728027, + "learning_rate": 4.930055108096651e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8516969680786133, + "num_tokens": 44367437.0, + "step": 1164 + }, + { + "epoch": 0.1481999745579443, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.517955780029297, + "learning_rate": 4.934294192454429e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8200781345367432, + "num_tokens": 44402636.0, + "step": 1165 + }, + { + "epoch": 0.1483271848365348, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.519135475158691, + "learning_rate": 4.938533276812209e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8372783660888672, + "num_tokens": 44438187.0, + "step": 1166 + }, + { + "epoch": 0.14845439511512531, + "ewc_loss": 0.008056640625, + "ewc_loss_parallel": 8.046627044677734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.61942195892334, + "learning_rate": 4.942772361169987e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8550529479980469, + "num_tokens": 44478345.0, + "step": 1167 + }, + { + "epoch": 0.14858160539371582, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.63437271118164, + "learning_rate": 4.947011445527766e-07, + "loss": 0.5173, + "mean_token_accuracy": 0.8357503414154053, + "num_tokens": 44520272.0, + "step": 1168 + }, + { + "epoch": 0.14870881567230632, + "ewc_loss": 0.008056640625, + "ewc_loss_parallel": 8.046627044677734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.533101081848145, + "learning_rate": 4.951250529885544e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8517851829528809, + "num_tokens": 44561058.0, + "step": 1169 + }, + { + "epoch": 0.14883602595089684, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.43239688873291, + "learning_rate": 4.955489614243324e-07, + "loss": 0.6017, + "mean_token_accuracy": 0.8085548877716064, + "num_tokens": 44599371.0, + "step": 1170 + }, + { + "epoch": 0.14896323622948734, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.51605224609375, + "learning_rate": 4.959728698601102e-07, + "loss": 0.5111, + "mean_token_accuracy": 0.8291642665863037, + "num_tokens": 44640499.0, + "step": 1171 + }, + { + "epoch": 0.14909044650807785, + "ewc_loss": 0.008056640625, + "ewc_loss_parallel": 8.046627044677734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.613371849060059, + "learning_rate": 4.963967782958881e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8422402143478394, + "num_tokens": 44683164.0, + "step": 1172 + }, + { + "epoch": 0.14921765678666837, + "ewc_loss": 0.008056640625, + "ewc_loss_parallel": 8.046627044677734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.456167221069336, + "learning_rate": 4.968206867316659e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.834255576133728, + "num_tokens": 44721938.0, + "step": 1173 + }, + { + "epoch": 0.14934486706525887, + "ewc_loss": 0.00799560546875, + "ewc_loss_parallel": 7.987022399902344e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.561389923095703, + "learning_rate": 4.972445951674439e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8431504368782043, + "num_tokens": 44753506.0, + "step": 1174 + }, + { + "epoch": 0.14947207734384937, + "ewc_loss": 0.008056640625, + "ewc_loss_parallel": 8.046627044677734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.59189510345459, + "learning_rate": 4.976685036032216e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.8182093501091003, + "num_tokens": 44792170.0, + "step": 1175 + }, + { + "epoch": 0.1495992876224399, + "ewc_loss": 0.008056640625, + "ewc_loss_parallel": 8.046627044677734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.466469764709473, + "learning_rate": 4.980924120389996e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8577089309692383, + "num_tokens": 44830733.0, + "step": 1176 + }, + { + "epoch": 0.1497264979010304, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.59426212310791, + "learning_rate": 4.985163204747774e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8510322570800781, + "num_tokens": 44869743.0, + "step": 1177 + }, + { + "epoch": 0.1498537081796209, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.593523979187012, + "learning_rate": 4.989402289105554e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.8427073955535889, + "num_tokens": 44913358.0, + "step": 1178 + }, + { + "epoch": 0.14998091845821143, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.719350814819336, + "learning_rate": 4.993641373463331e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.818381667137146, + "num_tokens": 44949963.0, + "step": 1179 + }, + { + "epoch": 0.15010812873680193, + "ewc_loss": 0.008056640625, + "ewc_loss_parallel": 8.046627044677734e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.512406349182129, + "learning_rate": 4.997880457821111e-07, + "loss": 0.5246, + "mean_token_accuracy": 0.8312338590621948, + "num_tokens": 44988569.0, + "step": 1180 + }, + { + "epoch": 0.15023533901539243, + "ewc_loss": 0.00811767578125, + "ewc_loss_parallel": 8.106231689453125e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.651808738708496, + "learning_rate": 5.002119542178889e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8390627503395081, + "num_tokens": 45032634.0, + "step": 1181 + }, + { + "epoch": 0.15036254929398296, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.473877906799316, + "learning_rate": 5.006358626536667e-07, + "loss": 0.4685, + "mean_token_accuracy": 0.8508508801460266, + "num_tokens": 45069685.0, + "step": 1182 + }, + { + "epoch": 0.15048975957257346, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.643593788146973, + "learning_rate": 5.010597710894446e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8419939875602722, + "num_tokens": 45107481.0, + "step": 1183 + }, + { + "epoch": 0.15061696985116396, + "ewc_loss": 0.00823974609375, + "ewc_loss_parallel": 8.225440979003906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.592196464538574, + "learning_rate": 5.014836795252225e-07, + "loss": 0.4758, + "mean_token_accuracy": 0.847081184387207, + "num_tokens": 45145747.0, + "step": 1184 + }, + { + "epoch": 0.1507441801297545, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.541363716125488, + "learning_rate": 5.019075879610004e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8477543592453003, + "num_tokens": 45183140.0, + "step": 1185 + }, + { + "epoch": 0.150871390408345, + "ewc_loss": 0.00823974609375, + "ewc_loss_parallel": 8.225440979003906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.625596046447754, + "learning_rate": 5.023314963967783e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8343249559402466, + "num_tokens": 45224911.0, + "step": 1186 + }, + { + "epoch": 0.1509986006869355, + "ewc_loss": 0.00830078125, + "ewc_loss_parallel": 8.285045623779297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.649040222167969, + "learning_rate": 5.027554048325562e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.820523738861084, + "num_tokens": 45257388.0, + "step": 1187 + }, + { + "epoch": 0.15112581096552602, + "ewc_loss": 0.00823974609375, + "ewc_loss_parallel": 8.225440979003906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.661224365234375, + "learning_rate": 5.03179313268334e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8230977654457092, + "num_tokens": 45297876.0, + "step": 1188 + }, + { + "epoch": 0.15125302124411652, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.667407989501953, + "learning_rate": 5.036032217041119e-07, + "loss": 0.5321, + "mean_token_accuracy": 0.8288295865058899, + "num_tokens": 45334040.0, + "step": 1189 + }, + { + "epoch": 0.15138023152270705, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.624092102050781, + "learning_rate": 5.040271301398897e-07, + "loss": 0.5173, + "mean_token_accuracy": 0.8324903249740601, + "num_tokens": 45375985.0, + "step": 1190 + }, + { + "epoch": 0.15150744180129755, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.587992668151855, + "learning_rate": 5.044510385756676e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8412160277366638, + "num_tokens": 45419951.0, + "step": 1191 + }, + { + "epoch": 0.15163465207988805, + "ewc_loss": 0.00823974609375, + "ewc_loss_parallel": 8.225440979003906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.61101245880127, + "learning_rate": 5.048749470114455e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8415899872779846, + "num_tokens": 45456988.0, + "step": 1192 + }, + { + "epoch": 0.15176186235847858, + "ewc_loss": 0.00823974609375, + "ewc_loss_parallel": 8.225440979003906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.619914054870605, + "learning_rate": 5.052988554472234e-07, + "loss": 0.5173, + "mean_token_accuracy": 0.8331971168518066, + "num_tokens": 45488226.0, + "step": 1193 + }, + { + "epoch": 0.15188907263706908, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.574991226196289, + "learning_rate": 5.057227638830013e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8370804786682129, + "num_tokens": 45524661.0, + "step": 1194 + }, + { + "epoch": 0.15201628291565958, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.713335990905762, + "learning_rate": 5.061466723187792e-07, + "loss": 0.5168, + "mean_token_accuracy": 0.8362868428230286, + "num_tokens": 45563491.0, + "step": 1195 + }, + { + "epoch": 0.1521434931942501, + "ewc_loss": 0.00823974609375, + "ewc_loss_parallel": 8.225440979003906e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.573807716369629, + "learning_rate": 5.065705807545569e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.8452432155609131, + "num_tokens": 45596771.0, + "step": 1196 + }, + { + "epoch": 0.1522707034728406, + "ewc_loss": 0.0081787109375, + "ewc_loss_parallel": 8.165836334228516e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.56946086883545, + "learning_rate": 5.069944891903349e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8380602598190308, + "num_tokens": 45640049.0, + "step": 1197 + }, + { + "epoch": 0.1523979137514311, + "ewc_loss": 0.00830078125, + "ewc_loss_parallel": 8.285045623779297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.616555213928223, + "learning_rate": 5.074183976261127e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.837063729763031, + "num_tokens": 45677375.0, + "step": 1198 + }, + { + "epoch": 0.15252512403002164, + "ewc_loss": 0.00830078125, + "ewc_loss_parallel": 8.285045623779297e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.738813400268555, + "learning_rate": 5.078423060618906e-07, + "loss": 0.5086, + "mean_token_accuracy": 0.8383195400238037, + "num_tokens": 45719253.0, + "step": 1199 + }, + { + "epoch": 0.15265233430861214, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.545432090759277, + "learning_rate": 5.082662144976685e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8347282409667969, + "num_tokens": 45756065.0, + "step": 1200 + }, + { + "epoch": 0.15277954458720264, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.605493545532227, + "learning_rate": 5.086901229334464e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.8445885181427002, + "num_tokens": 45791707.0, + "step": 1201 + }, + { + "epoch": 0.15290675486579317, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.768563270568848, + "learning_rate": 5.091140313692243e-07, + "loss": 0.5092, + "mean_token_accuracy": 0.8368902802467346, + "num_tokens": 45828464.0, + "step": 1202 + }, + { + "epoch": 0.15303396514438367, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.658034324645996, + "learning_rate": 5.095379398050022e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.8416407704353333, + "num_tokens": 45867046.0, + "step": 1203 + }, + { + "epoch": 0.15316117542297417, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.705753326416016, + "learning_rate": 5.099618482407799e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8342246413230896, + "num_tokens": 45900530.0, + "step": 1204 + }, + { + "epoch": 0.1532883857015647, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.684626579284668, + "learning_rate": 5.103857566765578e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8411786556243896, + "num_tokens": 45940002.0, + "step": 1205 + }, + { + "epoch": 0.1534155959801552, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.721952438354492, + "learning_rate": 5.108096651123357e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.85057532787323, + "num_tokens": 45975299.0, + "step": 1206 + }, + { + "epoch": 0.1535428062587457, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.601231575012207, + "learning_rate": 5.112335735481135e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8377670049667358, + "num_tokens": 46012693.0, + "step": 1207 + }, + { + "epoch": 0.15367001653733622, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.649438858032227, + "learning_rate": 5.116574819838915e-07, + "loss": 0.4054, + "mean_token_accuracy": 0.8689509630203247, + "num_tokens": 46051558.0, + "step": 1208 + }, + { + "epoch": 0.15379722681592672, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.682868957519531, + "learning_rate": 5.120813904196693e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8430601954460144, + "num_tokens": 46087173.0, + "step": 1209 + }, + { + "epoch": 0.15392443709451722, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.68654727935791, + "learning_rate": 5.125052988554473e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8305606842041016, + "num_tokens": 46120482.0, + "step": 1210 + }, + { + "epoch": 0.15405164737310775, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.677855491638184, + "learning_rate": 5.12929207291225e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8401952981948853, + "num_tokens": 46154478.0, + "step": 1211 + }, + { + "epoch": 0.15417885765169825, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.74691390991211, + "learning_rate": 5.133531157270029e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8488531112670898, + "num_tokens": 46195012.0, + "step": 1212 + }, + { + "epoch": 0.15430606793028875, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.713265419006348, + "learning_rate": 5.137770241627808e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8595854043960571, + "num_tokens": 46230641.0, + "step": 1213 + }, + { + "epoch": 0.15443327820887928, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.79246997833252, + "learning_rate": 5.142009325985587e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8495641946792603, + "num_tokens": 46268993.0, + "step": 1214 + }, + { + "epoch": 0.15456048848746978, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.789996147155762, + "learning_rate": 5.146248410343365e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8383392095565796, + "num_tokens": 46309310.0, + "step": 1215 + }, + { + "epoch": 0.1546876987660603, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.681997299194336, + "learning_rate": 5.150487494701145e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8557009100914001, + "num_tokens": 46352415.0, + "step": 1216 + }, + { + "epoch": 0.1548149090446508, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.742369651794434, + "learning_rate": 5.154726579058923e-07, + "loss": 0.5062, + "mean_token_accuracy": 0.836621880531311, + "num_tokens": 46387838.0, + "step": 1217 + }, + { + "epoch": 0.1549421193232413, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.633662223815918, + "learning_rate": 5.158965663416703e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8403664231300354, + "num_tokens": 46432669.0, + "step": 1218 + }, + { + "epoch": 0.15506932960183184, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.669387817382812, + "learning_rate": 5.16320474777448e-07, + "loss": 0.456, + "mean_token_accuracy": 0.853337287902832, + "num_tokens": 46474383.0, + "step": 1219 + }, + { + "epoch": 0.15519653988042234, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.784900665283203, + "learning_rate": 5.167443832132259e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.838552713394165, + "num_tokens": 46514263.0, + "step": 1220 + }, + { + "epoch": 0.15532375015901284, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.882806777954102, + "learning_rate": 5.171682916490038e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8482029438018799, + "num_tokens": 46549835.0, + "step": 1221 + }, + { + "epoch": 0.15545096043760337, + "ewc_loss": 0.00836181640625, + "ewc_loss_parallel": 8.344650268554688e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.809479713439941, + "learning_rate": 5.175922000847816e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8522147536277771, + "num_tokens": 46584811.0, + "step": 1222 + }, + { + "epoch": 0.15557817071619387, + "ewc_loss": 0.0084228515625, + "ewc_loss_parallel": 8.404254913330078e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.81147575378418, + "learning_rate": 5.180161085205595e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8457783460617065, + "num_tokens": 46621441.0, + "step": 1223 + }, + { + "epoch": 0.15570538099478437, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.822847366333008, + "learning_rate": 5.184400169563374e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.829421877861023, + "num_tokens": 46661572.0, + "step": 1224 + }, + { + "epoch": 0.1558325912733749, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.822673797607422, + "learning_rate": 5.188639253921153e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8356021642684937, + "num_tokens": 46702157.0, + "step": 1225 + }, + { + "epoch": 0.1559598015519654, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.979616165161133, + "learning_rate": 5.192878338278932e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.843900203704834, + "num_tokens": 46739119.0, + "step": 1226 + }, + { + "epoch": 0.1560870118305559, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.663126945495605, + "learning_rate": 5.19711742263671e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.826886773109436, + "num_tokens": 46783990.0, + "step": 1227 + }, + { + "epoch": 0.15621422210914643, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.926816940307617, + "learning_rate": 5.201356506994488e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8397809267044067, + "num_tokens": 46816131.0, + "step": 1228 + }, + { + "epoch": 0.15634143238773693, + "ewc_loss": 0.008544921875, + "ewc_loss_parallel": 8.52346420288086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.787227630615234, + "learning_rate": 5.205595591352268e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.8211485147476196, + "num_tokens": 46856453.0, + "step": 1229 + }, + { + "epoch": 0.15646864266632743, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.82210636138916, + "learning_rate": 5.209834675710046e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8373012542724609, + "num_tokens": 46897961.0, + "step": 1230 + }, + { + "epoch": 0.15659585294491796, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.059012413024902, + "learning_rate": 5.214073760067825e-07, + "loss": 0.5816, + "mean_token_accuracy": 0.8175909519195557, + "num_tokens": 46931961.0, + "step": 1231 + }, + { + "epoch": 0.15672306322350846, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.946467399597168, + "learning_rate": 5.218312844425604e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8421138525009155, + "num_tokens": 46967822.0, + "step": 1232 + }, + { + "epoch": 0.15685027350209896, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.83220386505127, + "learning_rate": 5.222551928783383e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8453594446182251, + "num_tokens": 47004511.0, + "step": 1233 + }, + { + "epoch": 0.1569774837806895, + "ewc_loss": 0.008544921875, + "ewc_loss_parallel": 8.52346420288086e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.71896743774414, + "learning_rate": 5.226791013141161e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.8318471312522888, + "num_tokens": 47047244.0, + "step": 1234 + }, + { + "epoch": 0.15710469405928, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.054577827453613, + "learning_rate": 5.23103009749894e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8418493270874023, + "num_tokens": 47088206.0, + "step": 1235 + }, + { + "epoch": 0.1572319043378705, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.85148811340332, + "learning_rate": 5.235269181856718e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8387792110443115, + "num_tokens": 47132733.0, + "step": 1236 + }, + { + "epoch": 0.15735911461646102, + "ewc_loss": 0.00848388671875, + "ewc_loss_parallel": 8.463859558105469e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.814496040344238, + "learning_rate": 5.239508266214498e-07, + "loss": 0.5601, + "mean_token_accuracy": 0.8190048933029175, + "num_tokens": 47166840.0, + "step": 1237 + }, + { + "epoch": 0.15748632489505152, + "ewc_loss": 0.00860595703125, + "ewc_loss_parallel": 8.58306884765625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.739717483520508, + "learning_rate": 5.243747350572276e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8357627391815186, + "num_tokens": 47203444.0, + "step": 1238 + }, + { + "epoch": 0.15761353517364202, + "ewc_loss": 0.00860595703125, + "ewc_loss_parallel": 8.58306884765625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.800460815429688, + "learning_rate": 5.247986434930056e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8395668268203735, + "num_tokens": 47240380.0, + "step": 1239 + }, + { + "epoch": 0.15774074545223254, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.92947006225586, + "learning_rate": 5.252225519287834e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8184254765510559, + "num_tokens": 47277886.0, + "step": 1240 + }, + { + "epoch": 0.15786795573082305, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.950408935546875, + "learning_rate": 5.256464603645613e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.843899667263031, + "num_tokens": 47315261.0, + "step": 1241 + }, + { + "epoch": 0.15799516600941357, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.8980131149292, + "learning_rate": 5.260703688003391e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8406266570091248, + "num_tokens": 47357489.0, + "step": 1242 + }, + { + "epoch": 0.15812237628800407, + "ewc_loss": 0.00860595703125, + "ewc_loss_parallel": 8.58306884765625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.878878593444824, + "learning_rate": 5.26494277236117e-07, + "loss": 0.5182, + "mean_token_accuracy": 0.835308313369751, + "num_tokens": 47389851.0, + "step": 1243 + }, + { + "epoch": 0.15824958656659457, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.949312210083008, + "learning_rate": 5.269181856718948e-07, + "loss": 0.5437, + "mean_token_accuracy": 0.8252023458480835, + "num_tokens": 47427491.0, + "step": 1244 + }, + { + "epoch": 0.1583767968451851, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.976261138916016, + "learning_rate": 5.273420941076727e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8473025560379028, + "num_tokens": 47462790.0, + "step": 1245 + }, + { + "epoch": 0.1585040071237756, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.950928688049316, + "learning_rate": 5.277660025434506e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8556032180786133, + "num_tokens": 47502240.0, + "step": 1246 + }, + { + "epoch": 0.1586312174023661, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.936640739440918, + "learning_rate": 5.281899109792285e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8488712310791016, + "num_tokens": 47542493.0, + "step": 1247 + }, + { + "epoch": 0.15875842768095663, + "ewc_loss": 0.00872802734375, + "ewc_loss_parallel": 8.702278137207031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.821710586547852, + "learning_rate": 5.286138194150064e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8402732014656067, + "num_tokens": 47583378.0, + "step": 1248 + }, + { + "epoch": 0.15888563795954713, + "ewc_loss": 0.00872802734375, + "ewc_loss_parallel": 8.702278137207031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.00404167175293, + "learning_rate": 5.290377278507841e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.836117684841156, + "num_tokens": 47622671.0, + "step": 1249 + }, + { + "epoch": 0.15901284823813763, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.878372192382812, + "learning_rate": 5.294616362865621e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8465044498443604, + "num_tokens": 47663774.0, + "step": 1250 + }, + { + "epoch": 0.15914005851672816, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.837578773498535, + "learning_rate": 5.298855447223399e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8305225372314453, + "num_tokens": 47706143.0, + "step": 1251 + }, + { + "epoch": 0.15926726879531866, + "ewc_loss": 0.00872802734375, + "ewc_loss_parallel": 8.702278137207031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.971196174621582, + "learning_rate": 5.303094531581178e-07, + "loss": 0.5254, + "mean_token_accuracy": 0.8342245221138, + "num_tokens": 47741197.0, + "step": 1252 + }, + { + "epoch": 0.15939447907390916, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.868794441223145, + "learning_rate": 5.307333615938957e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8555448055267334, + "num_tokens": 47779306.0, + "step": 1253 + }, + { + "epoch": 0.1595216893524997, + "ewc_loss": 0.00872802734375, + "ewc_loss_parallel": 8.702278137207031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.029092788696289, + "learning_rate": 5.311572700296736e-07, + "loss": 0.5334, + "mean_token_accuracy": 0.8318103551864624, + "num_tokens": 47811056.0, + "step": 1254 + }, + { + "epoch": 0.1596488996310902, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.853080749511719, + "learning_rate": 5.315811784654515e-07, + "loss": 0.441, + "mean_token_accuracy": 0.853745698928833, + "num_tokens": 47847075.0, + "step": 1255 + }, + { + "epoch": 0.1597761099096807, + "ewc_loss": 0.0086669921875, + "ewc_loss_parallel": 8.64267349243164e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.904752731323242, + "learning_rate": 5.320050869012294e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8422524929046631, + "num_tokens": 47892639.0, + "step": 1256 + }, + { + "epoch": 0.15990332018827122, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.068914413452148, + "learning_rate": 5.324289953370071e-07, + "loss": 0.5564, + "mean_token_accuracy": 0.8253458142280579, + "num_tokens": 47933522.0, + "step": 1257 + }, + { + "epoch": 0.16003053046686172, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.040011405944824, + "learning_rate": 5.328529037727851e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8415149450302124, + "num_tokens": 47977307.0, + "step": 1258 + }, + { + "epoch": 0.16015774074545222, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.077057838439941, + "learning_rate": 5.332768122085629e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8422901034355164, + "num_tokens": 48015129.0, + "step": 1259 + }, + { + "epoch": 0.16028495102404275, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.074043273925781, + "learning_rate": 5.337007206443408e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.8293788433074951, + "num_tokens": 48051904.0, + "step": 1260 + }, + { + "epoch": 0.16041216130263325, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.034411430358887, + "learning_rate": 5.341246290801187e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8477809429168701, + "num_tokens": 48094001.0, + "step": 1261 + }, + { + "epoch": 0.16053937158122375, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.989248275756836, + "learning_rate": 5.345485375158966e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8442726135253906, + "num_tokens": 48134124.0, + "step": 1262 + }, + { + "epoch": 0.16066658185981428, + "ewc_loss": 0.00872802734375, + "ewc_loss_parallel": 8.702278137207031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.100203514099121, + "learning_rate": 5.349724459516745e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8483677506446838, + "num_tokens": 48170730.0, + "step": 1263 + }, + { + "epoch": 0.16079379213840478, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.99997615814209, + "learning_rate": 5.353963543874522e-07, + "loss": 0.5398, + "mean_token_accuracy": 0.8297888040542603, + "num_tokens": 48212640.0, + "step": 1264 + }, + { + "epoch": 0.1609210024169953, + "ewc_loss": 0.00872802734375, + "ewc_loss_parallel": 8.702278137207031e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.189427375793457, + "learning_rate": 5.358202628232301e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8356733322143555, + "num_tokens": 48243136.0, + "step": 1265 + }, + { + "epoch": 0.1610482126955858, + "ewc_loss": 0.00885009765625, + "ewc_loss_parallel": 8.821487426757812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.263940811157227, + "learning_rate": 5.36244171259008e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8417842984199524, + "num_tokens": 48282905.0, + "step": 1266 + }, + { + "epoch": 0.1611754229741763, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.953886985778809, + "learning_rate": 5.366680796947859e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8317962884902954, + "num_tokens": 48323325.0, + "step": 1267 + }, + { + "epoch": 0.16130263325276684, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.064638137817383, + "learning_rate": 5.370919881305637e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8401402831077576, + "num_tokens": 48358400.0, + "step": 1268 + }, + { + "epoch": 0.16142984353135734, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.082134246826172, + "learning_rate": 5.375158965663417e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8386591672897339, + "num_tokens": 48393018.0, + "step": 1269 + }, + { + "epoch": 0.16155705380994784, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.045318603515625, + "learning_rate": 5.379398050021195e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8407801985740662, + "num_tokens": 48432952.0, + "step": 1270 + }, + { + "epoch": 0.16168426408853837, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.049734115600586, + "learning_rate": 5.383637134378975e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8413190245628357, + "num_tokens": 48466913.0, + "step": 1271 + }, + { + "epoch": 0.16181147436712887, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.130372047424316, + "learning_rate": 5.387876218736752e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.816650390625, + "num_tokens": 48512658.0, + "step": 1272 + }, + { + "epoch": 0.16193868464571937, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.117546081542969, + "learning_rate": 5.392115303094531e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8491997718811035, + "num_tokens": 48552283.0, + "step": 1273 + }, + { + "epoch": 0.1620658949243099, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.085551261901855, + "learning_rate": 5.39635438745231e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8563791513442993, + "num_tokens": 48590504.0, + "step": 1274 + }, + { + "epoch": 0.1621931052029004, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.101685523986816, + "learning_rate": 5.400593471810089e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8307382464408875, + "num_tokens": 48629993.0, + "step": 1275 + }, + { + "epoch": 0.1623203154814909, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.12755012512207, + "learning_rate": 5.404832556167867e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.8293173909187317, + "num_tokens": 48665074.0, + "step": 1276 + }, + { + "epoch": 0.16244752576008142, + "ewc_loss": 0.00885009765625, + "ewc_loss_parallel": 8.821487426757812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.03591537475586, + "learning_rate": 5.409071640525647e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8458505868911743, + "num_tokens": 48705524.0, + "step": 1277 + }, + { + "epoch": 0.16257473603867192, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.064872741699219, + "learning_rate": 5.413310724883425e-07, + "loss": 0.5426, + "mean_token_accuracy": 0.819242537021637, + "num_tokens": 48744030.0, + "step": 1278 + }, + { + "epoch": 0.16270194631726242, + "ewc_loss": 0.00885009765625, + "ewc_loss_parallel": 8.821487426757812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.103238105773926, + "learning_rate": 5.417549809241205e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8337389230728149, + "num_tokens": 48788401.0, + "step": 1279 + }, + { + "epoch": 0.16282915659585295, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.121952056884766, + "learning_rate": 5.421788893598982e-07, + "loss": 0.537, + "mean_token_accuracy": 0.8296350240707397, + "num_tokens": 48823046.0, + "step": 1280 + }, + { + "epoch": 0.16295636687444345, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.881092071533203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.03795051574707, + "learning_rate": 5.42602797795676e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8267502784729004, + "num_tokens": 48868293.0, + "step": 1281 + }, + { + "epoch": 0.16308357715303395, + "ewc_loss": 0.0087890625, + "ewc_loss_parallel": 8.761882781982422e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.963841438293457, + "learning_rate": 5.43026706231454e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8189884424209595, + "num_tokens": 48908775.0, + "step": 1282 + }, + { + "epoch": 0.16321078743162448, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.881092071533203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.059721946716309, + "learning_rate": 5.434506146672319e-07, + "loss": 0.5488, + "mean_token_accuracy": 0.8260821104049683, + "num_tokens": 48957640.0, + "step": 1283 + }, + { + "epoch": 0.16333799771021498, + "ewc_loss": 0.00885009765625, + "ewc_loss_parallel": 8.821487426757812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.991992950439453, + "learning_rate": 5.438745231030097e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8262081146240234, + "num_tokens": 48993601.0, + "step": 1284 + }, + { + "epoch": 0.16346520798880548, + "ewc_loss": 0.00885009765625, + "ewc_loss_parallel": 8.821487426757812e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.220636367797852, + "learning_rate": 5.442984315387876e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8561345338821411, + "num_tokens": 49028411.0, + "step": 1285 + }, + { + "epoch": 0.163592418267396, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.881092071533203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.08780574798584, + "learning_rate": 5.447223399745655e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8405057191848755, + "num_tokens": 49067449.0, + "step": 1286 + }, + { + "epoch": 0.1637196285459865, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.881092071533203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.1602201461792, + "learning_rate": 5.451462484103433e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8456141948699951, + "num_tokens": 49102650.0, + "step": 1287 + }, + { + "epoch": 0.163846838824577, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.098954200744629, + "learning_rate": 5.455701568461212e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8354247212409973, + "num_tokens": 49137110.0, + "step": 1288 + }, + { + "epoch": 0.16397404910316754, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.881092071533203e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.323856353759766, + "learning_rate": 5.45994065281899e-07, + "loss": 0.5813, + "mean_token_accuracy": 0.81386399269104, + "num_tokens": 49177765.0, + "step": 1289 + }, + { + "epoch": 0.16410125938175804, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.050873756408691, + "learning_rate": 5.46417973717677e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8307969570159912, + "num_tokens": 49207860.0, + "step": 1290 + }, + { + "epoch": 0.16422846966034857, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.109111785888672, + "learning_rate": 5.468418821534548e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8538773655891418, + "num_tokens": 49239994.0, + "step": 1291 + }, + { + "epoch": 0.16435567993893907, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 12.984725952148438, + "learning_rate": 5.472657905892327e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8510261178016663, + "num_tokens": 49273464.0, + "step": 1292 + }, + { + "epoch": 0.16448289021752957, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.166831016540527, + "learning_rate": 5.476896990250106e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.816304087638855, + "num_tokens": 49313144.0, + "step": 1293 + }, + { + "epoch": 0.1646101004961201, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.209545135498047, + "learning_rate": 5.481136074607885e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.8450263142585754, + "num_tokens": 49351392.0, + "step": 1294 + }, + { + "epoch": 0.1647373107747106, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.04089641571045, + "learning_rate": 5.485375158965663e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8499795198440552, + "num_tokens": 49396726.0, + "step": 1295 + }, + { + "epoch": 0.1648645210533011, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.127274513244629, + "learning_rate": 5.489614243323442e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8375946283340454, + "num_tokens": 49435856.0, + "step": 1296 + }, + { + "epoch": 0.16499173133189163, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.13117504119873, + "learning_rate": 5.49385332768122e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8551770448684692, + "num_tokens": 49473695.0, + "step": 1297 + }, + { + "epoch": 0.16511894161048213, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.070991516113281, + "learning_rate": 5.498092412039e-07, + "loss": 0.5717, + "mean_token_accuracy": 0.8232150077819824, + "num_tokens": 49509165.0, + "step": 1298 + }, + { + "epoch": 0.16524615188907263, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.166274070739746, + "learning_rate": 5.502331496396778e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8459803462028503, + "num_tokens": 49553790.0, + "step": 1299 + }, + { + "epoch": 0.16537336216766316, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.209131240844727, + "learning_rate": 5.506570580754557e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8412187099456787, + "num_tokens": 49593124.0, + "step": 1300 + }, + { + "epoch": 0.16550057244625366, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.115501403808594, + "learning_rate": 5.510809665112336e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8468648195266724, + "num_tokens": 49632016.0, + "step": 1301 + }, + { + "epoch": 0.16562778272484416, + "ewc_loss": 0.0089111328125, + "ewc_loss_parallel": 8.940696716308594e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.192098617553711, + "learning_rate": 5.515048749470113e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8402530550956726, + "num_tokens": 49670689.0, + "step": 1302 + }, + { + "epoch": 0.1657549930034347, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.217198371887207, + "learning_rate": 5.519287833827893e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8427135944366455, + "num_tokens": 49709230.0, + "step": 1303 + }, + { + "epoch": 0.1658822032820252, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.019393920898438, + "learning_rate": 5.523526918185671e-07, + "loss": 0.5444, + "mean_token_accuracy": 0.8256363868713379, + "num_tokens": 49741648.0, + "step": 1304 + }, + { + "epoch": 0.1660094135606157, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.161117553710938, + "learning_rate": 5.52776600254345e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8484950661659241, + "num_tokens": 49773906.0, + "step": 1305 + }, + { + "epoch": 0.16613662383920622, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.159464836120605, + "learning_rate": 5.532005086901229e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.839043915271759, + "num_tokens": 49812645.0, + "step": 1306 + }, + { + "epoch": 0.16626383411779672, + "ewc_loss": 0.00897216796875, + "ewc_loss_parallel": 9.000301361083984e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.143985748291016, + "learning_rate": 5.536244171259008e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.8445745706558228, + "num_tokens": 49854500.0, + "step": 1307 + }, + { + "epoch": 0.16639104439638722, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.115368843078613, + "learning_rate": 5.540483255616786e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8184230327606201, + "num_tokens": 49889408.0, + "step": 1308 + }, + { + "epoch": 0.16651825467497774, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.121015548706055, + "learning_rate": 5.544722339974566e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8391111493110657, + "num_tokens": 49923581.0, + "step": 1309 + }, + { + "epoch": 0.16664546495356825, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.153913497924805, + "learning_rate": 5.548961424332343e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8384160399436951, + "num_tokens": 49959433.0, + "step": 1310 + }, + { + "epoch": 0.16677267523215875, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.257874488830566, + "learning_rate": 5.553200508690123e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8532129526138306, + "num_tokens": 50000307.0, + "step": 1311 + }, + { + "epoch": 0.16689988551074927, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.289555549621582, + "learning_rate": 5.557439593047901e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.8218183517456055, + "num_tokens": 50035587.0, + "step": 1312 + }, + { + "epoch": 0.16702709578933977, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.105566024780273, + "learning_rate": 5.56167867740568e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8434439301490784, + "num_tokens": 50079425.0, + "step": 1313 + }, + { + "epoch": 0.16715430606793028, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.352564811706543, + "learning_rate": 5.565917761763459e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8477671146392822, + "num_tokens": 50110674.0, + "step": 1314 + }, + { + "epoch": 0.1672815163465208, + "ewc_loss": 0.00909423828125, + "ewc_loss_parallel": 9.119510650634766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.168967247009277, + "learning_rate": 5.570156846121238e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.8334718942642212, + "num_tokens": 50149033.0, + "step": 1315 + }, + { + "epoch": 0.1674087266251113, + "ewc_loss": 0.00909423828125, + "ewc_loss_parallel": 9.119510650634766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.285367965698242, + "learning_rate": 5.574395930479016e-07, + "loss": 0.5357, + "mean_token_accuracy": 0.8306257128715515, + "num_tokens": 50185728.0, + "step": 1316 + }, + { + "epoch": 0.16753593690370183, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.393991470336914, + "learning_rate": 5.578635014836796e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8452426195144653, + "num_tokens": 50219671.0, + "step": 1317 + }, + { + "epoch": 0.16766314718229233, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.208609580993652, + "learning_rate": 5.582874099194573e-07, + "loss": 0.5478, + "mean_token_accuracy": 0.8298492431640625, + "num_tokens": 50251989.0, + "step": 1318 + }, + { + "epoch": 0.16779035746088283, + "ewc_loss": 0.009033203125, + "ewc_loss_parallel": 9.059906005859375e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.117340087890625, + "learning_rate": 5.587113183552353e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8319380879402161, + "num_tokens": 50291079.0, + "step": 1319 + }, + { + "epoch": 0.16791756773947336, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.233375549316406, + "learning_rate": 5.591352267910131e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.844597339630127, + "num_tokens": 50332262.0, + "step": 1320 + }, + { + "epoch": 0.16804477801806386, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.280664443969727, + "learning_rate": 5.59559135226791e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8382208347320557, + "num_tokens": 50371278.0, + "step": 1321 + }, + { + "epoch": 0.16817198829665436, + "ewc_loss": 0.00909423828125, + "ewc_loss_parallel": 9.119510650634766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.28407096862793, + "learning_rate": 5.599830436625689e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.841249942779541, + "num_tokens": 50409463.0, + "step": 1322 + }, + { + "epoch": 0.1682991985752449, + "ewc_loss": 0.00927734375, + "ewc_loss_parallel": 9.298324584960938e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.318868637084961, + "learning_rate": 5.604069520983468e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8441396951675415, + "num_tokens": 50446745.0, + "step": 1323 + }, + { + "epoch": 0.1684264088538354, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.459575653076172, + "learning_rate": 5.608308605341246e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8455764055252075, + "num_tokens": 50479725.0, + "step": 1324 + }, + { + "epoch": 0.1685536191324259, + "ewc_loss": 0.00909423828125, + "ewc_loss_parallel": 9.119510650634766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.272205352783203, + "learning_rate": 5.612547689699024e-07, + "loss": 0.5429, + "mean_token_accuracy": 0.8277576565742493, + "num_tokens": 50514734.0, + "step": 1325 + }, + { + "epoch": 0.16868082941101642, + "ewc_loss": 0.00909423828125, + "ewc_loss_parallel": 9.119510650634766e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.209309577941895, + "learning_rate": 5.616786774056803e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8363876342773438, + "num_tokens": 50555165.0, + "step": 1326 + }, + { + "epoch": 0.16880803968960692, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.429325103759766, + "learning_rate": 5.621025858414582e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8333730101585388, + "num_tokens": 50598251.0, + "step": 1327 + }, + { + "epoch": 0.16893524996819742, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.168557167053223, + "learning_rate": 5.625264942772361e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8385032415390015, + "num_tokens": 50638423.0, + "step": 1328 + }, + { + "epoch": 0.16906246024678795, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.397907257080078, + "learning_rate": 5.629504027130139e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8270665407180786, + "num_tokens": 50678226.0, + "step": 1329 + }, + { + "epoch": 0.16918967052537845, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.309298515319824, + "learning_rate": 5.633743111487919e-07, + "loss": 0.5733, + "mean_token_accuracy": 0.815520167350769, + "num_tokens": 50717542.0, + "step": 1330 + }, + { + "epoch": 0.16931688080396895, + "ewc_loss": 0.0091552734375, + "ewc_loss_parallel": 9.179115295410156e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.415352821350098, + "learning_rate": 5.637982195845697e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8549542427062988, + "num_tokens": 50752914.0, + "step": 1331 + }, + { + "epoch": 0.16944409108255948, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.320319175720215, + "learning_rate": 5.642221280203476e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8559597730636597, + "num_tokens": 50789276.0, + "step": 1332 + }, + { + "epoch": 0.16957130136114998, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.454206466674805, + "learning_rate": 5.646460364561254e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8461429476737976, + "num_tokens": 50820759.0, + "step": 1333 + }, + { + "epoch": 0.16969851163974048, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.404422760009766, + "learning_rate": 5.650699448919033e-07, + "loss": 0.5636, + "mean_token_accuracy": 0.8223134279251099, + "num_tokens": 50863241.0, + "step": 1334 + }, + { + "epoch": 0.169825721918331, + "ewc_loss": 0.00927734375, + "ewc_loss_parallel": 9.298324584960938e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.392452239990234, + "learning_rate": 5.654938533276812e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8391119241714478, + "num_tokens": 50904296.0, + "step": 1335 + }, + { + "epoch": 0.1699529321969215, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.319605827331543, + "learning_rate": 5.659177617634591e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8443939685821533, + "num_tokens": 50942180.0, + "step": 1336 + }, + { + "epoch": 0.170080142475512, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.418400764465332, + "learning_rate": 5.663416701992369e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8370801210403442, + "num_tokens": 50980287.0, + "step": 1337 + }, + { + "epoch": 0.17020735275410254, + "ewc_loss": 0.00927734375, + "ewc_loss_parallel": 9.298324584960938e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.479711532592773, + "learning_rate": 5.667655786350149e-07, + "loss": 0.5791, + "mean_token_accuracy": 0.8114035725593567, + "num_tokens": 51014218.0, + "step": 1338 + }, + { + "epoch": 0.17033456303269304, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.403234481811523, + "learning_rate": 5.671894870707927e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8447702527046204, + "num_tokens": 51055915.0, + "step": 1339 + }, + { + "epoch": 0.17046177331128357, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.370410919189453, + "learning_rate": 5.676133955065705e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8565218448638916, + "num_tokens": 51090684.0, + "step": 1340 + }, + { + "epoch": 0.17058898358987407, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.556793212890625, + "learning_rate": 5.680373039423484e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.84377121925354, + "num_tokens": 51130156.0, + "step": 1341 + }, + { + "epoch": 0.17071619386846457, + "ewc_loss": 0.00927734375, + "ewc_loss_parallel": 9.298324584960938e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.442337036132812, + "learning_rate": 5.684612123781263e-07, + "loss": 0.5253, + "mean_token_accuracy": 0.8322643041610718, + "num_tokens": 51171282.0, + "step": 1342 + }, + { + "epoch": 0.1708434041470551, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.351313591003418, + "learning_rate": 5.688851208139042e-07, + "loss": 0.542, + "mean_token_accuracy": 0.8265811204910278, + "num_tokens": 51210616.0, + "step": 1343 + }, + { + "epoch": 0.1709706144256456, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.398855209350586, + "learning_rate": 5.69309029249682e-07, + "loss": 0.5282, + "mean_token_accuracy": 0.8369084596633911, + "num_tokens": 51253357.0, + "step": 1344 + }, + { + "epoch": 0.1710978247042361, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.389463424682617, + "learning_rate": 5.697329376854599e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8369461894035339, + "num_tokens": 51292671.0, + "step": 1345 + }, + { + "epoch": 0.17122503498282662, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.48347282409668, + "learning_rate": 5.701568461212378e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8576080203056335, + "num_tokens": 51329145.0, + "step": 1346 + }, + { + "epoch": 0.17135224526141712, + "ewc_loss": 0.00921630859375, + "ewc_loss_parallel": 9.238719940185547e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.170797348022461, + "learning_rate": 5.705807545570157e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.853171706199646, + "num_tokens": 51371265.0, + "step": 1347 + }, + { + "epoch": 0.17147945554000762, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.473722457885742, + "learning_rate": 5.710046629927934e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8396013975143433, + "num_tokens": 51410262.0, + "step": 1348 + }, + { + "epoch": 0.17160666581859815, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.590147018432617, + "learning_rate": 5.714285714285714e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8510478734970093, + "num_tokens": 51447796.0, + "step": 1349 + }, + { + "epoch": 0.17173387609718865, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.466023445129395, + "learning_rate": 5.718524798643492e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8355045914649963, + "num_tokens": 51482677.0, + "step": 1350 + }, + { + "epoch": 0.17186108637577915, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.628532409667969, + "learning_rate": 5.722763883001272e-07, + "loss": 0.5278, + "mean_token_accuracy": 0.8343169689178467, + "num_tokens": 51524111.0, + "step": 1351 + }, + { + "epoch": 0.17198829665436968, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.374526977539062, + "learning_rate": 5.72700296735905e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8394551873207092, + "num_tokens": 51561008.0, + "step": 1352 + }, + { + "epoch": 0.17211550693296018, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.527591705322266, + "learning_rate": 5.731242051716829e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.8212670087814331, + "num_tokens": 51603091.0, + "step": 1353 + }, + { + "epoch": 0.17224271721155068, + "ewc_loss": 0.0093994140625, + "ewc_loss_parallel": 9.417533874511719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.508907318115234, + "learning_rate": 5.735481136074608e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.84780353307724, + "num_tokens": 51637789.0, + "step": 1354 + }, + { + "epoch": 0.1723699274901412, + "ewc_loss": 0.0093994140625, + "ewc_loss_parallel": 9.417533874511719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.433746337890625, + "learning_rate": 5.739720220432386e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.8446921110153198, + "num_tokens": 51674207.0, + "step": 1355 + }, + { + "epoch": 0.1724971377687317, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.337084770202637, + "learning_rate": 5.743959304790164e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8218507766723633, + "num_tokens": 51715109.0, + "step": 1356 + }, + { + "epoch": 0.1726243480473222, + "ewc_loss": 0.0093994140625, + "ewc_loss_parallel": 9.417533874511719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.526078224182129, + "learning_rate": 5.748198389147944e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8371042013168335, + "num_tokens": 51753373.0, + "step": 1357 + }, + { + "epoch": 0.17275155832591274, + "ewc_loss": 0.0093994140625, + "ewc_loss_parallel": 9.417533874511719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.575726509094238, + "learning_rate": 5.752437473505722e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.827684223651886, + "num_tokens": 51795100.0, + "step": 1358 + }, + { + "epoch": 0.17287876860450324, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.403068542480469, + "learning_rate": 5.756676557863502e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.8313689231872559, + "num_tokens": 51831840.0, + "step": 1359 + }, + { + "epoch": 0.17300597888309374, + "ewc_loss": 0.00933837890625, + "ewc_loss_parallel": 9.357929229736328e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.506708145141602, + "learning_rate": 5.76091564222128e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8472420573234558, + "num_tokens": 51870227.0, + "step": 1360 + }, + { + "epoch": 0.17313318916168427, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.603128433227539, + "learning_rate": 5.765154726579059e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8390032649040222, + "num_tokens": 51908251.0, + "step": 1361 + }, + { + "epoch": 0.17326039944027477, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.526555061340332, + "learning_rate": 5.769393810936838e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8413236141204834, + "num_tokens": 51944531.0, + "step": 1362 + }, + { + "epoch": 0.17338760971886527, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.49058723449707, + "learning_rate": 5.773632895294616e-07, + "loss": 0.5518, + "mean_token_accuracy": 0.8251607418060303, + "num_tokens": 51985742.0, + "step": 1363 + }, + { + "epoch": 0.1735148199974558, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.489421844482422, + "learning_rate": 5.777871979652394e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8446332812309265, + "num_tokens": 52022036.0, + "step": 1364 + }, + { + "epoch": 0.1736420302760463, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.559830665588379, + "learning_rate": 5.782111064010173e-07, + "loss": 0.5103, + "mean_token_accuracy": 0.8364454507827759, + "num_tokens": 52066077.0, + "step": 1365 + }, + { + "epoch": 0.17376924055463683, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.39433765411377, + "learning_rate": 5.786350148367952e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.8397225141525269, + "num_tokens": 52111280.0, + "step": 1366 + }, + { + "epoch": 0.17389645083322733, + "ewc_loss": 0.0093994140625, + "ewc_loss_parallel": 9.417533874511719e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.590474128723145, + "learning_rate": 5.790589232725731e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8381450176239014, + "num_tokens": 52153148.0, + "step": 1367 + }, + { + "epoch": 0.17402366111181783, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.515302658081055, + "learning_rate": 5.79482831708351e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8455049991607666, + "num_tokens": 52186722.0, + "step": 1368 + }, + { + "epoch": 0.17415087139040836, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.586997985839844, + "learning_rate": 5.799067401441288e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.8393337726593018, + "num_tokens": 52220837.0, + "step": 1369 + }, + { + "epoch": 0.17427808166899886, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.564440727233887, + "learning_rate": 5.803306485799068e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8479533195495605, + "num_tokens": 52257049.0, + "step": 1370 + }, + { + "epoch": 0.17440529194758936, + "ewc_loss": 0.00958251953125, + "ewc_loss_parallel": 9.59634780883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.548019409179688, + "learning_rate": 5.807545570156845e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8516356945037842, + "num_tokens": 52297632.0, + "step": 1371 + }, + { + "epoch": 0.1745325022261799, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.725382804870605, + "learning_rate": 5.811784654514624e-07, + "loss": 0.5567, + "mean_token_accuracy": 0.8237301111221313, + "num_tokens": 52331008.0, + "step": 1372 + }, + { + "epoch": 0.1746597125047704, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.479629516601562, + "learning_rate": 5.816023738872403e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8451366424560547, + "num_tokens": 52369420.0, + "step": 1373 + }, + { + "epoch": 0.1747869227833609, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.703763961791992, + "learning_rate": 5.820262823230182e-07, + "loss": 0.5185, + "mean_token_accuracy": 0.8352684378623962, + "num_tokens": 52411759.0, + "step": 1374 + }, + { + "epoch": 0.17491413306195142, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.491270065307617, + "learning_rate": 5.824501907587961e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8463485240936279, + "num_tokens": 52446459.0, + "step": 1375 + }, + { + "epoch": 0.17504134334054192, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.591568946838379, + "learning_rate": 5.82874099194574e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8206483721733093, + "num_tokens": 52484615.0, + "step": 1376 + }, + { + "epoch": 0.17516855361913242, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.637808799743652, + "learning_rate": 5.832980076303518e-07, + "loss": 0.5724, + "mean_token_accuracy": 0.8181472420692444, + "num_tokens": 52525195.0, + "step": 1377 + }, + { + "epoch": 0.17529576389772294, + "ewc_loss": 0.00958251953125, + "ewc_loss_parallel": 9.59634780883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.553678512573242, + "learning_rate": 5.837219160661297e-07, + "loss": 0.4551, + "mean_token_accuracy": 0.8535500764846802, + "num_tokens": 52558783.0, + "step": 1378 + }, + { + "epoch": 0.17542297417631345, + "ewc_loss": 0.00958251953125, + "ewc_loss_parallel": 9.59634780883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.500085830688477, + "learning_rate": 5.841458245019075e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.8542001247406006, + "num_tokens": 52599176.0, + "step": 1379 + }, + { + "epoch": 0.17555018445490395, + "ewc_loss": 0.00946044921875, + "ewc_loss_parallel": 9.47713851928711e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.707733154296875, + "learning_rate": 5.845697329376855e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.8414750695228577, + "num_tokens": 52630865.0, + "step": 1380 + }, + { + "epoch": 0.17567739473349447, + "ewc_loss": 0.00958251953125, + "ewc_loss_parallel": 9.59634780883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.493341445922852, + "learning_rate": 5.849936413734633e-07, + "loss": 0.5589, + "mean_token_accuracy": 0.8236284255981445, + "num_tokens": 52671702.0, + "step": 1381 + }, + { + "epoch": 0.17580460501208497, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.513846397399902, + "learning_rate": 5.854175498092412e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8520216941833496, + "num_tokens": 52712288.0, + "step": 1382 + }, + { + "epoch": 0.17593181529067548, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.56915283203125, + "learning_rate": 5.858414582450191e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8402970433235168, + "num_tokens": 52756569.0, + "step": 1383 + }, + { + "epoch": 0.176059025569266, + "ewc_loss": 0.00970458984375, + "ewc_loss_parallel": 9.715557098388672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.584451675415039, + "learning_rate": 5.86265366680797e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.8330674171447754, + "num_tokens": 52796195.0, + "step": 1384 + }, + { + "epoch": 0.1761862358478565, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.538782119750977, + "learning_rate": 5.866892751165748e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8414781093597412, + "num_tokens": 52839687.0, + "step": 1385 + }, + { + "epoch": 0.176313446126447, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.606650352478027, + "learning_rate": 5.871131835523526e-07, + "loss": 0.5547, + "mean_token_accuracy": 0.8244059681892395, + "num_tokens": 52875660.0, + "step": 1386 + }, + { + "epoch": 0.17644065640503753, + "ewc_loss": 0.00958251953125, + "ewc_loss_parallel": 9.59634780883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.589339256286621, + "learning_rate": 5.875370919881305e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8353200554847717, + "num_tokens": 52921220.0, + "step": 1387 + }, + { + "epoch": 0.17656786668362803, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.538922309875488, + "learning_rate": 5.879610004239084e-07, + "loss": 0.5654, + "mean_token_accuracy": 0.8242533206939697, + "num_tokens": 52960042.0, + "step": 1388 + }, + { + "epoch": 0.17669507696221853, + "ewc_loss": 0.009521484375, + "ewc_loss_parallel": 9.5367431640625e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.67161750793457, + "learning_rate": 5.883849088596863e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8266627788543701, + "num_tokens": 53002817.0, + "step": 1389 + }, + { + "epoch": 0.17682228724080906, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.685932159423828, + "learning_rate": 5.888088172954641e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8344141840934753, + "num_tokens": 53037751.0, + "step": 1390 + }, + { + "epoch": 0.17694949751939956, + "ewc_loss": 0.00958251953125, + "ewc_loss_parallel": 9.59634780883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.58338737487793, + "learning_rate": 5.892327257312421e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8619579076766968, + "num_tokens": 53074286.0, + "step": 1391 + }, + { + "epoch": 0.1770767077979901, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.599923133850098, + "learning_rate": 5.896566341670199e-07, + "loss": 0.4564, + "mean_token_accuracy": 0.8480451703071594, + "num_tokens": 53113525.0, + "step": 1392 + }, + { + "epoch": 0.1772039180765806, + "ewc_loss": 0.00958251953125, + "ewc_loss_parallel": 9.59634780883789e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.558930397033691, + "learning_rate": 5.900805426027977e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8403888940811157, + "num_tokens": 53150137.0, + "step": 1393 + }, + { + "epoch": 0.1773311283551711, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.662184715270996, + "learning_rate": 5.905044510385756e-07, + "loss": 0.4514, + "mean_token_accuracy": 0.8538491725921631, + "num_tokens": 53189519.0, + "step": 1394 + }, + { + "epoch": 0.17745833863376162, + "ewc_loss": 0.009765625, + "ewc_loss_parallel": 9.775161743164062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.610318183898926, + "learning_rate": 5.909283594743535e-07, + "loss": 0.5649, + "mean_token_accuracy": 0.8210757374763489, + "num_tokens": 53227452.0, + "step": 1395 + }, + { + "epoch": 0.17758554891235212, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.609862327575684, + "learning_rate": 5.913522679101314e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8369971513748169, + "num_tokens": 53265875.0, + "step": 1396 + }, + { + "epoch": 0.17771275919094262, + "ewc_loss": 0.009765625, + "ewc_loss_parallel": 9.775161743164062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.634790420532227, + "learning_rate": 5.917761763459093e-07, + "loss": 0.5568, + "mean_token_accuracy": 0.8226222395896912, + "num_tokens": 53305477.0, + "step": 1397 + }, + { + "epoch": 0.17783996946953315, + "ewc_loss": 0.00970458984375, + "ewc_loss_parallel": 9.715557098388672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.594178199768066, + "learning_rate": 5.922000847816871e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8493068218231201, + "num_tokens": 53342800.0, + "step": 1398 + }, + { + "epoch": 0.17796717974812365, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.672309875488281, + "learning_rate": 5.926239932174651e-07, + "loss": 0.5398, + "mean_token_accuracy": 0.8297773003578186, + "num_tokens": 53387476.0, + "step": 1399 + }, + { + "epoch": 0.17809439002671415, + "ewc_loss": 0.00970458984375, + "ewc_loss_parallel": 9.715557098388672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.629151344299316, + "learning_rate": 5.930479016532429e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8412397503852844, + "num_tokens": 53424628.0, + "step": 1400 + }, + { + "epoch": 0.17822160030530468, + "ewc_loss": 0.0098876953125, + "ewc_loss_parallel": 9.894371032714844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.663630485534668, + "learning_rate": 5.934718100890207e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8443553447723389, + "num_tokens": 53464459.0, + "step": 1401 + }, + { + "epoch": 0.17834881058389518, + "ewc_loss": 0.00970458984375, + "ewc_loss_parallel": 9.715557098388672e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.579421043395996, + "learning_rate": 5.938957185247986e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.8293845057487488, + "num_tokens": 53501721.0, + "step": 1402 + }, + { + "epoch": 0.17847602086248568, + "ewc_loss": 0.0096435546875, + "ewc_loss_parallel": 9.655952453613281e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.586064338684082, + "learning_rate": 5.943196269605765e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8414299488067627, + "num_tokens": 53536607.0, + "step": 1403 + }, + { + "epoch": 0.1786032311410762, + "ewc_loss": 0.0098876953125, + "ewc_loss_parallel": 9.894371032714844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.68840503692627, + "learning_rate": 5.947435353963544e-07, + "loss": 0.5385, + "mean_token_accuracy": 0.8327409029006958, + "num_tokens": 53569809.0, + "step": 1404 + }, + { + "epoch": 0.1787304414196667, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.586417198181152, + "learning_rate": 5.951674438321323e-07, + "loss": 0.5141, + "mean_token_accuracy": 0.8366702795028687, + "num_tokens": 53604423.0, + "step": 1405 + }, + { + "epoch": 0.1788576516982572, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.727350234985352, + "learning_rate": 5.955913522679101e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8341063857078552, + "num_tokens": 53642145.0, + "step": 1406 + }, + { + "epoch": 0.17898486197684774, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.643336296081543, + "learning_rate": 5.96015260703688e-07, + "loss": 0.5865, + "mean_token_accuracy": 0.8128001093864441, + "num_tokens": 53682998.0, + "step": 1407 + }, + { + "epoch": 0.17911207225543824, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.73631477355957, + "learning_rate": 5.964391691394659e-07, + "loss": 0.5187, + "mean_token_accuracy": 0.8370780348777771, + "num_tokens": 53729764.0, + "step": 1408 + }, + { + "epoch": 0.17923928253402874, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.747529983520508, + "learning_rate": 5.968630775752436e-07, + "loss": 0.5529, + "mean_token_accuracy": 0.8256770372390747, + "num_tokens": 53764522.0, + "step": 1409 + }, + { + "epoch": 0.17936649281261927, + "ewc_loss": 0.009765625, + "ewc_loss_parallel": 9.775161743164062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.821052551269531, + "learning_rate": 5.972869860110216e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.826530396938324, + "num_tokens": 53799791.0, + "step": 1410 + }, + { + "epoch": 0.17949370309120977, + "ewc_loss": 0.009765625, + "ewc_loss_parallel": 9.775161743164062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.51575756072998, + "learning_rate": 5.977108944467994e-07, + "loss": 0.5728, + "mean_token_accuracy": 0.8226487636566162, + "num_tokens": 53840181.0, + "step": 1411 + }, + { + "epoch": 0.17962091336980027, + "ewc_loss": 0.009765625, + "ewc_loss_parallel": 9.775161743164062e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.670082092285156, + "learning_rate": 5.981348028825774e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8498783111572266, + "num_tokens": 53882813.0, + "step": 1412 + }, + { + "epoch": 0.1797481236483908, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.748991966247559, + "learning_rate": 5.985587113183552e-07, + "loss": 0.5273, + "mean_token_accuracy": 0.8332073092460632, + "num_tokens": 53922198.0, + "step": 1413 + }, + { + "epoch": 0.1798753339269813, + "ewc_loss": 0.0098876953125, + "ewc_loss_parallel": 9.894371032714844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.674304962158203, + "learning_rate": 5.989826197541331e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.835880696773529, + "num_tokens": 53959390.0, + "step": 1414 + }, + { + "epoch": 0.18000254420557182, + "ewc_loss": 0.00982666015625, + "ewc_loss_parallel": 9.834766387939453e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.697629928588867, + "learning_rate": 5.99406528189911e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.83099365234375, + "num_tokens": 54005707.0, + "step": 1415 + }, + { + "epoch": 0.18012975448416232, + "ewc_loss": 0.0098876953125, + "ewc_loss_parallel": 9.894371032714844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.737464904785156, + "learning_rate": 5.998304366256888e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8354997634887695, + "num_tokens": 54047808.0, + "step": 1416 + }, + { + "epoch": 0.18025696476275282, + "ewc_loss": 0.0098876953125, + "ewc_loss_parallel": 9.894371032714844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.685689926147461, + "learning_rate": 6.002543450614666e-07, + "loss": 0.4377, + "mean_token_accuracy": 0.8616663813591003, + "num_tokens": 54082929.0, + "step": 1417 + }, + { + "epoch": 0.18038417504134335, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.78050708770752, + "learning_rate": 6.006782534972446e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.8405765295028687, + "num_tokens": 54120199.0, + "step": 1418 + }, + { + "epoch": 0.18051138531993385, + "ewc_loss": 0.00994873046875, + "ewc_loss_parallel": 9.953975677490234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.816366195678711, + "learning_rate": 6.011021619330224e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.8285102844238281, + "num_tokens": 54158629.0, + "step": 1419 + }, + { + "epoch": 0.18063859559852435, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.767717361450195, + "learning_rate": 6.015260703688004e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8333718776702881, + "num_tokens": 54194772.0, + "step": 1420 + }, + { + "epoch": 0.18076580587711488, + "ewc_loss": 0.0098876953125, + "ewc_loss_parallel": 9.894371032714844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.718744277954102, + "learning_rate": 6.019499788045782e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.8346614837646484, + "num_tokens": 54229825.0, + "step": 1421 + }, + { + "epoch": 0.18089301615570538, + "ewc_loss": 0.0098876953125, + "ewc_loss_parallel": 9.894371032714844e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.692853927612305, + "learning_rate": 6.023738872403561e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8438609838485718, + "num_tokens": 54260051.0, + "step": 1422 + }, + { + "epoch": 0.18102022643429588, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.760848999023438, + "learning_rate": 6.02797795676134e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8431036472320557, + "num_tokens": 54297865.0, + "step": 1423 + }, + { + "epoch": 0.1811474367128864, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.780471801757812, + "learning_rate": 6.032217041119118e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8299878835678101, + "num_tokens": 54334618.0, + "step": 1424 + }, + { + "epoch": 0.1812746469914769, + "ewc_loss": 0.00994873046875, + "ewc_loss_parallel": 9.953975677490234e-06, + "ewc_loss_perp": 0.0, + "grad_norm": 13.719801902770996, + "learning_rate": 6.036456125476896e-07, + "loss": 0.5578, + "mean_token_accuracy": 0.8224218487739563, + "num_tokens": 54369761.0, + "step": 1425 + }, + { + "epoch": 0.1814018572700674, + "ewc_loss": 0.01007080078125, + "ewc_loss_parallel": 1.0073184967041016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.834538459777832, + "learning_rate": 6.040695209834675e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8513047695159912, + "num_tokens": 54411549.0, + "step": 1426 + }, + { + "epoch": 0.18152906754865794, + "ewc_loss": 0.01007080078125, + "ewc_loss_parallel": 1.0073184967041016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.84178638458252, + "learning_rate": 6.044934294192454e-07, + "loss": 0.5411, + "mean_token_accuracy": 0.8299803733825684, + "num_tokens": 54449137.0, + "step": 1427 + }, + { + "epoch": 0.18165627782724844, + "ewc_loss": 0.01007080078125, + "ewc_loss_parallel": 1.0073184967041016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.733931541442871, + "learning_rate": 6.049173378550233e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8442100286483765, + "num_tokens": 54494604.0, + "step": 1428 + }, + { + "epoch": 0.18178348810583894, + "ewc_loss": 0.0101318359375, + "ewc_loss_parallel": 1.0132789611816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.738930702209473, + "learning_rate": 6.053412462908012e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8459398746490479, + "num_tokens": 54533533.0, + "step": 1429 + }, + { + "epoch": 0.18191069838442947, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.772261619567871, + "learning_rate": 6.05765154726579e-07, + "loss": 0.5492, + "mean_token_accuracy": 0.8273980617523193, + "num_tokens": 54575211.0, + "step": 1430 + }, + { + "epoch": 0.18203790866301997, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.73306655883789, + "learning_rate": 6.061890631623569e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8536117076873779, + "num_tokens": 54618308.0, + "step": 1431 + }, + { + "epoch": 0.18216511894161047, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.770964622497559, + "learning_rate": 6.066129715981347e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8414987921714783, + "num_tokens": 54660260.0, + "step": 1432 + }, + { + "epoch": 0.182292329220201, + "ewc_loss": 0.01007080078125, + "ewc_loss_parallel": 1.0073184967041016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.856724739074707, + "learning_rate": 6.070368800339126e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8498469591140747, + "num_tokens": 54692056.0, + "step": 1433 + }, + { + "epoch": 0.1824195394987915, + "ewc_loss": 0.01025390625, + "ewc_loss_parallel": 1.0251998901367188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.92392635345459, + "learning_rate": 6.074607884696905e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8430107831954956, + "num_tokens": 54729252.0, + "step": 1434 + }, + { + "epoch": 0.182546749777382, + "ewc_loss": 0.01019287109375, + "ewc_loss_parallel": 1.0192394256591797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.821216583251953, + "learning_rate": 6.078846969054684e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8354023098945618, + "num_tokens": 54763422.0, + "step": 1435 + }, + { + "epoch": 0.18267396005597253, + "ewc_loss": 0.01007080078125, + "ewc_loss_parallel": 1.0073184967041016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.858633995056152, + "learning_rate": 6.083086053412463e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8452993631362915, + "num_tokens": 54803494.0, + "step": 1436 + }, + { + "epoch": 0.18280117033456303, + "ewc_loss": 0.01025390625, + "ewc_loss_parallel": 1.0251998901367188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.869681358337402, + "learning_rate": 6.087325137770242e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8468495607376099, + "num_tokens": 54839349.0, + "step": 1437 + }, + { + "epoch": 0.18292838061315353, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.817660331726074, + "learning_rate": 6.09156422212802e-07, + "loss": 0.5476, + "mean_token_accuracy": 0.8303584456443787, + "num_tokens": 54881941.0, + "step": 1438 + }, + { + "epoch": 0.18305559089174406, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.725811958312988, + "learning_rate": 6.095803306485799e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8382574319839478, + "num_tokens": 54922927.0, + "step": 1439 + }, + { + "epoch": 0.18318280117033456, + "ewc_loss": 0.0101318359375, + "ewc_loss_parallel": 1.0132789611816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.005352020263672, + "learning_rate": 6.100042390843577e-07, + "loss": 0.5267, + "mean_token_accuracy": 0.8314822912216187, + "num_tokens": 54963885.0, + "step": 1440 + }, + { + "epoch": 0.1833100114489251, + "ewc_loss": 0.01019287109375, + "ewc_loss_parallel": 1.0192394256591797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.834760665893555, + "learning_rate": 6.104281475201356e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8460750579833984, + "num_tokens": 55001284.0, + "step": 1441 + }, + { + "epoch": 0.1834372217275156, + "ewc_loss": 0.010009765625, + "ewc_loss_parallel": 1.0013580322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.75156307220459, + "learning_rate": 6.108520559559135e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.8300719857215881, + "num_tokens": 55034813.0, + "step": 1442 + }, + { + "epoch": 0.1835644320061061, + "ewc_loss": 0.01019287109375, + "ewc_loss_parallel": 1.0192394256591797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.978972434997559, + "learning_rate": 6.112759643916914e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8575005531311035, + "num_tokens": 55070620.0, + "step": 1443 + }, + { + "epoch": 0.18369164228469662, + "ewc_loss": 0.0101318359375, + "ewc_loss_parallel": 1.0132789611816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.828730583190918, + "learning_rate": 6.116998728274693e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.8385809659957886, + "num_tokens": 55108394.0, + "step": 1444 + }, + { + "epoch": 0.18381885256328712, + "ewc_loss": 0.01007080078125, + "ewc_loss_parallel": 1.0073184967041016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.899569511413574, + "learning_rate": 6.121237812632472e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8313718438148499, + "num_tokens": 55148170.0, + "step": 1445 + }, + { + "epoch": 0.18394606284187762, + "ewc_loss": 0.01025390625, + "ewc_loss_parallel": 1.0251998901367188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.913768768310547, + "learning_rate": 6.125476896990249e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.851661741733551, + "num_tokens": 55188102.0, + "step": 1446 + }, + { + "epoch": 0.18407327312046814, + "ewc_loss": 0.01031494140625, + "ewc_loss_parallel": 1.0311603546142578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.853039741516113, + "learning_rate": 6.129715981348028e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8426756262779236, + "num_tokens": 55225893.0, + "step": 1447 + }, + { + "epoch": 0.18420048339905865, + "ewc_loss": 0.0101318359375, + "ewc_loss_parallel": 1.0132789611816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.94486141204834, + "learning_rate": 6.133955065705807e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.8302478790283203, + "num_tokens": 55265321.0, + "step": 1448 + }, + { + "epoch": 0.18432769367764915, + "ewc_loss": 0.01019287109375, + "ewc_loss_parallel": 1.0192394256591797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.93410873413086, + "learning_rate": 6.138194150063585e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.846295952796936, + "num_tokens": 55304179.0, + "step": 1449 + }, + { + "epoch": 0.18445490395623967, + "ewc_loss": 0.01019287109375, + "ewc_loss_parallel": 1.0192394256591797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.808897018432617, + "learning_rate": 6.142433234421365e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8338026404380798, + "num_tokens": 55343266.0, + "step": 1450 + }, + { + "epoch": 0.18458211423483017, + "ewc_loss": 0.01019287109375, + "ewc_loss_parallel": 1.0192394256591797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.955789566040039, + "learning_rate": 6.146672318779143e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.8335514068603516, + "num_tokens": 55381147.0, + "step": 1451 + }, + { + "epoch": 0.18470932451342068, + "ewc_loss": 0.01025390625, + "ewc_loss_parallel": 1.0251998901367188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.02345085144043, + "learning_rate": 6.150911403136923e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8353763818740845, + "num_tokens": 55414855.0, + "step": 1452 + }, + { + "epoch": 0.1848365347920112, + "ewc_loss": 0.01019287109375, + "ewc_loss_parallel": 1.0192394256591797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.772453308105469, + "learning_rate": 6.155150487494701e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8556772470474243, + "num_tokens": 55449699.0, + "step": 1453 + }, + { + "epoch": 0.1849637450706017, + "ewc_loss": 0.0101318359375, + "ewc_loss_parallel": 1.0132789611816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.866290092468262, + "learning_rate": 6.159389571852479e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8484845161437988, + "num_tokens": 55493202.0, + "step": 1454 + }, + { + "epoch": 0.1850909553491922, + "ewc_loss": 0.01043701171875, + "ewc_loss_parallel": 1.043081283569336e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.922584533691406, + "learning_rate": 6.163628656210258e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8423913717269897, + "num_tokens": 55539873.0, + "step": 1455 + }, + { + "epoch": 0.18521816562778273, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.87277603149414, + "learning_rate": 6.167867740568037e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.8241441249847412, + "num_tokens": 55576338.0, + "step": 1456 + }, + { + "epoch": 0.18534537590637323, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.972970008850098, + "learning_rate": 6.172106824925815e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8455958366394043, + "num_tokens": 55618330.0, + "step": 1457 + }, + { + "epoch": 0.18547258618496373, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.853973388671875, + "learning_rate": 6.176345909283595e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8418896794319153, + "num_tokens": 55657394.0, + "step": 1458 + }, + { + "epoch": 0.18559979646355426, + "ewc_loss": 0.01025390625, + "ewc_loss_parallel": 1.0251998901367188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.982878684997559, + "learning_rate": 6.180584993641373e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.845251739025116, + "num_tokens": 55695087.0, + "step": 1459 + }, + { + "epoch": 0.18572700674214476, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.895635604858398, + "learning_rate": 6.184824077999153e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8510371446609497, + "num_tokens": 55730852.0, + "step": 1460 + }, + { + "epoch": 0.18585421702073526, + "ewc_loss": 0.01043701171875, + "ewc_loss_parallel": 1.043081283569336e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.139654159545898, + "learning_rate": 6.189063162356931e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8234992027282715, + "num_tokens": 55765678.0, + "step": 1461 + }, + { + "epoch": 0.1859814272993258, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.046082496643066, + "learning_rate": 6.193302246714709e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8466023802757263, + "num_tokens": 55804409.0, + "step": 1462 + }, + { + "epoch": 0.1861086375779163, + "ewc_loss": 0.01031494140625, + "ewc_loss_parallel": 1.0311603546142578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.913202285766602, + "learning_rate": 6.197541331072488e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.835240364074707, + "num_tokens": 55839520.0, + "step": 1463 + }, + { + "epoch": 0.1862358478565068, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.126001358032227, + "learning_rate": 6.201780415430267e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.84184730052948, + "num_tokens": 55879089.0, + "step": 1464 + }, + { + "epoch": 0.18636305813509732, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.106042861938477, + "learning_rate": 6.206019499788045e-07, + "loss": 0.5409, + "mean_token_accuracy": 0.8290380239486694, + "num_tokens": 55916655.0, + "step": 1465 + }, + { + "epoch": 0.18649026841368782, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.904438018798828, + "learning_rate": 6.210258584145825e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8450349569320679, + "num_tokens": 55953828.0, + "step": 1466 + }, + { + "epoch": 0.18661747869227835, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.009737014770508, + "learning_rate": 6.214497668503603e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.8299355506896973, + "num_tokens": 55992037.0, + "step": 1467 + }, + { + "epoch": 0.18674468897086885, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.952884674072266, + "learning_rate": 6.218736752861383e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8466161489486694, + "num_tokens": 56020424.0, + "step": 1468 + }, + { + "epoch": 0.18687189924945935, + "ewc_loss": 0.01043701171875, + "ewc_loss_parallel": 1.043081283569336e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.03742790222168, + "learning_rate": 6.22297583721916e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8418957591056824, + "num_tokens": 56060839.0, + "step": 1469 + }, + { + "epoch": 0.18699910952804988, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.100489616394043, + "learning_rate": 6.227214921576938e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8312335014343262, + "num_tokens": 56099095.0, + "step": 1470 + }, + { + "epoch": 0.18712631980664038, + "ewc_loss": 0.01043701171875, + "ewc_loss_parallel": 1.043081283569336e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.964750289916992, + "learning_rate": 6.231454005934718e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8591556549072266, + "num_tokens": 56133805.0, + "step": 1471 + }, + { + "epoch": 0.18725353008523088, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.055900573730469, + "learning_rate": 6.235693090292496e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8255844116210938, + "num_tokens": 56173064.0, + "step": 1472 + }, + { + "epoch": 0.1873807403638214, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.932709693908691, + "learning_rate": 6.239932174650275e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8605096936225891, + "num_tokens": 56213285.0, + "step": 1473 + }, + { + "epoch": 0.1875079506424119, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.991719245910645, + "learning_rate": 6.244171259008054e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8300623297691345, + "num_tokens": 56248218.0, + "step": 1474 + }, + { + "epoch": 0.1876351609210024, + "ewc_loss": 0.01055908203125, + "ewc_loss_parallel": 1.055002212524414e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.067178726196289, + "learning_rate": 6.248410343365833e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8502501249313354, + "num_tokens": 56286098.0, + "step": 1475 + }, + { + "epoch": 0.18776237119959294, + "ewc_loss": 0.0103759765625, + "ewc_loss_parallel": 1.0371208190917969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.888971328735352, + "learning_rate": 6.252649427723612e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.848820686340332, + "num_tokens": 56329732.0, + "step": 1476 + }, + { + "epoch": 0.18788958147818344, + "ewc_loss": 0.010498046875, + "ewc_loss_parallel": 1.049041748046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.012165069580078, + "learning_rate": 6.25688851208139e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8472242951393127, + "num_tokens": 56367838.0, + "step": 1477 + }, + { + "epoch": 0.18801679175677394, + "ewc_loss": 0.01043701171875, + "ewc_loss_parallel": 1.043081283569336e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.924901008605957, + "learning_rate": 6.261127596439168e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8289931416511536, + "num_tokens": 56413059.0, + "step": 1478 + }, + { + "epoch": 0.18814400203536447, + "ewc_loss": 0.01043701171875, + "ewc_loss_parallel": 1.043081283569336e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.919289588928223, + "learning_rate": 6.265366680796948e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8543103933334351, + "num_tokens": 56453806.0, + "step": 1479 + }, + { + "epoch": 0.18827121231395497, + "ewc_loss": 0.01055908203125, + "ewc_loss_parallel": 1.055002212524414e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.16554069519043, + "learning_rate": 6.269605765154726e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8431036472320557, + "num_tokens": 56497448.0, + "step": 1480 + }, + { + "epoch": 0.18839842259254547, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.035067558288574, + "learning_rate": 6.273844849512505e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8430237174034119, + "num_tokens": 56537731.0, + "step": 1481 + }, + { + "epoch": 0.188525632871136, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 13.998955726623535, + "learning_rate": 6.278083933870284e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.862947940826416, + "num_tokens": 56580944.0, + "step": 1482 + }, + { + "epoch": 0.1886528431497265, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.106527328491211, + "learning_rate": 6.282323018228063e-07, + "loss": 0.4597, + "mean_token_accuracy": 0.8538340330123901, + "num_tokens": 56616306.0, + "step": 1483 + }, + { + "epoch": 0.188780053428317, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.005897521972656, + "learning_rate": 6.286562102585841e-07, + "loss": 0.5398, + "mean_token_accuracy": 0.8329033255577087, + "num_tokens": 56661874.0, + "step": 1484 + }, + { + "epoch": 0.18890726370690752, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.05244255065918, + "learning_rate": 6.29080118694362e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8578691482543945, + "num_tokens": 56696697.0, + "step": 1485 + }, + { + "epoch": 0.18903447398549802, + "ewc_loss": 0.01068115234375, + "ewc_loss_parallel": 1.0669231414794922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.208761215209961, + "learning_rate": 6.295040271301398e-07, + "loss": 0.4342, + "mean_token_accuracy": 0.858435869216919, + "num_tokens": 56732581.0, + "step": 1486 + }, + { + "epoch": 0.18916168426408853, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.139883041381836, + "learning_rate": 6.299279355659178e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8460595607757568, + "num_tokens": 56777905.0, + "step": 1487 + }, + { + "epoch": 0.18928889454267905, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.153311729431152, + "learning_rate": 6.303518440016956e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8382800221443176, + "num_tokens": 56812915.0, + "step": 1488 + }, + { + "epoch": 0.18941610482126955, + "ewc_loss": 0.01068115234375, + "ewc_loss_parallel": 1.0669231414794922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.20378303527832, + "learning_rate": 6.307757524374735e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.842928409576416, + "num_tokens": 56846907.0, + "step": 1489 + }, + { + "epoch": 0.18954331509986008, + "ewc_loss": 0.01055908203125, + "ewc_loss_parallel": 1.055002212524414e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.196173667907715, + "learning_rate": 6.311996608732514e-07, + "loss": 0.5258, + "mean_token_accuracy": 0.836978554725647, + "num_tokens": 56884614.0, + "step": 1490 + }, + { + "epoch": 0.18967052537845058, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.132170677185059, + "learning_rate": 6.316235693090292e-07, + "loss": 0.5178, + "mean_token_accuracy": 0.8338713049888611, + "num_tokens": 56920508.0, + "step": 1491 + }, + { + "epoch": 0.18979773565704108, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.103278160095215, + "learning_rate": 6.320474777448071e-07, + "loss": 0.5204, + "mean_token_accuracy": 0.8332321643829346, + "num_tokens": 56957629.0, + "step": 1492 + }, + { + "epoch": 0.1899249459356316, + "ewc_loss": 0.01068115234375, + "ewc_loss_parallel": 1.0669231414794922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.265873908996582, + "learning_rate": 6.324713861805849e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.844652533531189, + "num_tokens": 56997265.0, + "step": 1493 + }, + { + "epoch": 0.1900521562142221, + "ewc_loss": 0.010498046875, + "ewc_loss_parallel": 1.049041748046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.044293403625488, + "learning_rate": 6.328952946163628e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.8202635049819946, + "num_tokens": 57031519.0, + "step": 1494 + }, + { + "epoch": 0.1901793664928126, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.15061092376709, + "learning_rate": 6.333192030521407e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8507962226867676, + "num_tokens": 57067264.0, + "step": 1495 + }, + { + "epoch": 0.19030657677140314, + "ewc_loss": 0.0106201171875, + "ewc_loss_parallel": 1.0609626770019531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.055157661437988, + "learning_rate": 6.337431114879186e-07, + "loss": 0.5074, + "mean_token_accuracy": 0.8391233682632446, + "num_tokens": 57104906.0, + "step": 1496 + }, + { + "epoch": 0.19043378704999364, + "ewc_loss": 0.01080322265625, + "ewc_loss_parallel": 1.0788440704345703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.261996269226074, + "learning_rate": 6.341670199236965e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8495460152626038, + "num_tokens": 57142534.0, + "step": 1497 + }, + { + "epoch": 0.19056099732858414, + "ewc_loss": 0.01068115234375, + "ewc_loss_parallel": 1.0669231414794922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.02553653717041, + "learning_rate": 6.345909283594744e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8539451360702515, + "num_tokens": 57188735.0, + "step": 1498 + }, + { + "epoch": 0.19068820760717467, + "ewc_loss": 0.01068115234375, + "ewc_loss_parallel": 1.0669231414794922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.037555694580078, + "learning_rate": 6.350148367952522e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.832978367805481, + "num_tokens": 57224610.0, + "step": 1499 + }, + { + "epoch": 0.19081541788576517, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.293051719665527, + "learning_rate": 6.354387452310301e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8485709428787231, + "num_tokens": 57259553.0, + "step": 1500 + }, + { + "epoch": 0.19094262816435567, + "ewc_loss": 0.01068115234375, + "ewc_loss_parallel": 1.0669231414794922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.078368186950684, + "learning_rate": 6.358626536668079e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8511337041854858, + "num_tokens": 57295819.0, + "step": 1501 + }, + { + "epoch": 0.1910698384429462, + "ewc_loss": 0.01068115234375, + "ewc_loss_parallel": 1.0669231414794922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.138331413269043, + "learning_rate": 6.362865621025858e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8558326363563538, + "num_tokens": 57339110.0, + "step": 1502 + }, + { + "epoch": 0.1911970487215367, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.104896545410156, + "learning_rate": 6.367104705383637e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8453016877174377, + "num_tokens": 57377311.0, + "step": 1503 + }, + { + "epoch": 0.1913242590001272, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.16076946258545, + "learning_rate": 6.371343789741416e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.8378461599349976, + "num_tokens": 57419281.0, + "step": 1504 + }, + { + "epoch": 0.19145146927871773, + "ewc_loss": 0.01080322265625, + "ewc_loss_parallel": 1.0788440704345703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.061935424804688, + "learning_rate": 6.375582874099195e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.847267746925354, + "num_tokens": 57458134.0, + "step": 1505 + }, + { + "epoch": 0.19157867955730823, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.136311531066895, + "learning_rate": 6.379821958456974e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.8283808827400208, + "num_tokens": 57492931.0, + "step": 1506 + }, + { + "epoch": 0.19170588983589873, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.144609451293945, + "learning_rate": 6.384061042814751e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8368309736251831, + "num_tokens": 57529581.0, + "step": 1507 + }, + { + "epoch": 0.19183310011448926, + "ewc_loss": 0.01080322265625, + "ewc_loss_parallel": 1.0788440704345703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.144582748413086, + "learning_rate": 6.38830012717253e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8424067497253418, + "num_tokens": 57570302.0, + "step": 1508 + }, + { + "epoch": 0.19196031039307976, + "ewc_loss": 0.0107421875, + "ewc_loss_parallel": 1.0728836059570312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.126449584960938, + "learning_rate": 6.392539211530309e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8358895778656006, + "num_tokens": 57615974.0, + "step": 1509 + }, + { + "epoch": 0.19208752067167026, + "ewc_loss": 0.01080322265625, + "ewc_loss_parallel": 1.0788440704345703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.007722854614258, + "learning_rate": 6.396778295888087e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.850821316242218, + "num_tokens": 57653737.0, + "step": 1510 + }, + { + "epoch": 0.1922147309502608, + "ewc_loss": 0.01092529296875, + "ewc_loss_parallel": 1.0907649993896484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.180646896362305, + "learning_rate": 6.401017380245867e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8372681140899658, + "num_tokens": 57693431.0, + "step": 1511 + }, + { + "epoch": 0.1923419412288513, + "ewc_loss": 0.01080322265625, + "ewc_loss_parallel": 1.0788440704345703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.186454772949219, + "learning_rate": 6.405256464603645e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8415923118591309, + "num_tokens": 57736413.0, + "step": 1512 + }, + { + "epoch": 0.1924691515074418, + "ewc_loss": 0.01080322265625, + "ewc_loss_parallel": 1.0788440704345703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.27212905883789, + "learning_rate": 6.409495548961425e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.849925696849823, + "num_tokens": 57773293.0, + "step": 1513 + }, + { + "epoch": 0.19259636178603232, + "ewc_loss": 0.0108642578125, + "ewc_loss_parallel": 1.0848045349121094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.16082763671875, + "learning_rate": 6.413734633319203e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8450383543968201, + "num_tokens": 57809294.0, + "step": 1514 + }, + { + "epoch": 0.19272357206462282, + "ewc_loss": 0.010986328125, + "ewc_loss_parallel": 1.0967254638671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.188432693481445, + "learning_rate": 6.417973717676981e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8514529466629028, + "num_tokens": 57843610.0, + "step": 1515 + }, + { + "epoch": 0.19285078234321335, + "ewc_loss": 0.01092529296875, + "ewc_loss_parallel": 1.0907649993896484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.264181137084961, + "learning_rate": 6.42221280203476e-07, + "loss": 0.518, + "mean_token_accuracy": 0.8320920467376709, + "num_tokens": 57893098.0, + "step": 1516 + }, + { + "epoch": 0.19297799262180385, + "ewc_loss": 0.01104736328125, + "ewc_loss_parallel": 1.1026859283447266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.096797943115234, + "learning_rate": 6.426451886392539e-07, + "loss": 0.5452, + "mean_token_accuracy": 0.8315415382385254, + "num_tokens": 57933098.0, + "step": 1517 + }, + { + "epoch": 0.19310520290039435, + "ewc_loss": 0.010986328125, + "ewc_loss_parallel": 1.0967254638671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.221393585205078, + "learning_rate": 6.430690970750317e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8472231030464172, + "num_tokens": 57971502.0, + "step": 1518 + }, + { + "epoch": 0.19323241317898487, + "ewc_loss": 0.01104736328125, + "ewc_loss_parallel": 1.1026859283447266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.262463569641113, + "learning_rate": 6.434930055108097e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.830608606338501, + "num_tokens": 58001557.0, + "step": 1519 + }, + { + "epoch": 0.19335962345757537, + "ewc_loss": 0.01092529296875, + "ewc_loss_parallel": 1.0907649993896484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.22391128540039, + "learning_rate": 6.439169139465875e-07, + "loss": 0.5357, + "mean_token_accuracy": 0.8288347721099854, + "num_tokens": 58038058.0, + "step": 1520 + }, + { + "epoch": 0.19348683373616588, + "ewc_loss": 0.01104736328125, + "ewc_loss_parallel": 1.1026859283447266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.294925689697266, + "learning_rate": 6.443408223823655e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.8386461734771729, + "num_tokens": 58073138.0, + "step": 1521 + }, + { + "epoch": 0.1936140440147564, + "ewc_loss": 0.01104736328125, + "ewc_loss_parallel": 1.1026859283447266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.29820728302002, + "learning_rate": 6.447647308181432e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.840509831905365, + "num_tokens": 58111793.0, + "step": 1522 + }, + { + "epoch": 0.1937412542933469, + "ewc_loss": 0.01104736328125, + "ewc_loss_parallel": 1.1026859283447266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.129831314086914, + "learning_rate": 6.451886392539211e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8445993661880493, + "num_tokens": 58152911.0, + "step": 1523 + }, + { + "epoch": 0.1938684645719374, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.401949882507324, + "learning_rate": 6.45612547689699e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8410547375679016, + "num_tokens": 58192095.0, + "step": 1524 + }, + { + "epoch": 0.19399567485052793, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.324193000793457, + "learning_rate": 6.460364561254769e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8530381917953491, + "num_tokens": 58232651.0, + "step": 1525 + }, + { + "epoch": 0.19412288512911843, + "ewc_loss": 0.01104736328125, + "ewc_loss_parallel": 1.1026859283447266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.127801895141602, + "learning_rate": 6.464603645612547e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8552663326263428, + "num_tokens": 58270802.0, + "step": 1526 + }, + { + "epoch": 0.19425009540770893, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.325737953186035, + "learning_rate": 6.468842729970327e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8472055196762085, + "num_tokens": 58313179.0, + "step": 1527 + }, + { + "epoch": 0.19437730568629946, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.402999877929688, + "learning_rate": 6.473081814328105e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.8373307585716248, + "num_tokens": 58354728.0, + "step": 1528 + }, + { + "epoch": 0.19450451596488996, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.286308288574219, + "learning_rate": 6.477320898685885e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8400663733482361, + "num_tokens": 58398649.0, + "step": 1529 + }, + { + "epoch": 0.19463172624348046, + "ewc_loss": 0.010986328125, + "ewc_loss_parallel": 1.0967254638671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.35716438293457, + "learning_rate": 6.481559983043662e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8334460258483887, + "num_tokens": 58438261.0, + "step": 1530 + }, + { + "epoch": 0.194758936522071, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.351550102233887, + "learning_rate": 6.48579906740144e-07, + "loss": 0.4635, + "mean_token_accuracy": 0.8493707180023193, + "num_tokens": 58479141.0, + "step": 1531 + }, + { + "epoch": 0.1948861468006615, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.352554321289062, + "learning_rate": 6.49003815175922e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8615394830703735, + "num_tokens": 58515700.0, + "step": 1532 + }, + { + "epoch": 0.195013357079252, + "ewc_loss": 0.01123046875, + "ewc_loss_parallel": 1.1205673217773438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.333549499511719, + "learning_rate": 6.494277236116998e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.86031174659729, + "num_tokens": 58549825.0, + "step": 1533 + }, + { + "epoch": 0.19514056735784252, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.432008743286133, + "learning_rate": 6.498516320474777e-07, + "loss": 0.5778, + "mean_token_accuracy": 0.8207955360412598, + "num_tokens": 58596110.0, + "step": 1534 + }, + { + "epoch": 0.19526777763643302, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.321202278137207, + "learning_rate": 6.502755404832556e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8407690525054932, + "num_tokens": 58632410.0, + "step": 1535 + }, + { + "epoch": 0.19539498791502352, + "ewc_loss": 0.010986328125, + "ewc_loss_parallel": 1.0967254638671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.287745475769043, + "learning_rate": 6.506994489190335e-07, + "loss": 0.5506, + "mean_token_accuracy": 0.8300646543502808, + "num_tokens": 58671339.0, + "step": 1536 + }, + { + "epoch": 0.19552219819361405, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.431102752685547, + "learning_rate": 6.511233573548114e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8548032641410828, + "num_tokens": 58711786.0, + "step": 1537 + }, + { + "epoch": 0.19564940847220455, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.24413013458252, + "learning_rate": 6.515472657905892e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8416759967803955, + "num_tokens": 58743600.0, + "step": 1538 + }, + { + "epoch": 0.19577661875079505, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.404497146606445, + "learning_rate": 6.51971174226367e-07, + "loss": 0.551, + "mean_token_accuracy": 0.8252519369125366, + "num_tokens": 58780622.0, + "step": 1539 + }, + { + "epoch": 0.19590382902938558, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.422091484069824, + "learning_rate": 6.52395082662145e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8269665241241455, + "num_tokens": 58819370.0, + "step": 1540 + }, + { + "epoch": 0.19603103930797608, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.175599098205566, + "learning_rate": 6.528189910979228e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8496986627578735, + "num_tokens": 58856270.0, + "step": 1541 + }, + { + "epoch": 0.1961582495865666, + "ewc_loss": 0.01123046875, + "ewc_loss_parallel": 1.1205673217773438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.373149871826172, + "learning_rate": 6.532428995337007e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8446728587150574, + "num_tokens": 58898702.0, + "step": 1542 + }, + { + "epoch": 0.1962854598651571, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.51828384399414, + "learning_rate": 6.536668079694786e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8360108137130737, + "num_tokens": 58934602.0, + "step": 1543 + }, + { + "epoch": 0.1964126701437476, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.577285766601562, + "learning_rate": 6.540907164052565e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8426134586334229, + "num_tokens": 58977766.0, + "step": 1544 + }, + { + "epoch": 0.19653988042233814, + "ewc_loss": 0.01129150390625, + "ewc_loss_parallel": 1.1265277862548828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.449399948120117, + "learning_rate": 6.545146248410343e-07, + "loss": 0.502, + "mean_token_accuracy": 0.8388112187385559, + "num_tokens": 59019649.0, + "step": 1545 + }, + { + "epoch": 0.19666709070092864, + "ewc_loss": 0.01123046875, + "ewc_loss_parallel": 1.1205673217773438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.511979103088379, + "learning_rate": 6.549385332768122e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8377988338470459, + "num_tokens": 59060543.0, + "step": 1546 + }, + { + "epoch": 0.19679430097951914, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.460150718688965, + "learning_rate": 6.5536244171259e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.8354827761650085, + "num_tokens": 59100049.0, + "step": 1547 + }, + { + "epoch": 0.19692151125810967, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.353584289550781, + "learning_rate": 6.55786350148368e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8337311148643494, + "num_tokens": 59137370.0, + "step": 1548 + }, + { + "epoch": 0.19704872153670017, + "ewc_loss": 0.01104736328125, + "ewc_loss_parallel": 1.1026859283447266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.657391548156738, + "learning_rate": 6.562102585841458e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8445147275924683, + "num_tokens": 59176196.0, + "step": 1549 + }, + { + "epoch": 0.19717593181529067, + "ewc_loss": 0.01129150390625, + "ewc_loss_parallel": 1.1265277862548828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.279279708862305, + "learning_rate": 6.566341670199236e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8493337631225586, + "num_tokens": 59216215.0, + "step": 1550 + }, + { + "epoch": 0.1973031420938812, + "ewc_loss": 0.0111083984375, + "ewc_loss_parallel": 1.1086463928222656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.334976196289062, + "learning_rate": 6.570580754557016e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8434025049209595, + "num_tokens": 59254173.0, + "step": 1551 + }, + { + "epoch": 0.1974303523724717, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.454713821411133, + "learning_rate": 6.574819838914794e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8520591855049133, + "num_tokens": 59288496.0, + "step": 1552 + }, + { + "epoch": 0.1975575626510622, + "ewc_loss": 0.01116943359375, + "ewc_loss_parallel": 1.1146068572998047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.38670825958252, + "learning_rate": 6.579058923272573e-07, + "loss": 0.518, + "mean_token_accuracy": 0.833472728729248, + "num_tokens": 59329043.0, + "step": 1553 + }, + { + "epoch": 0.19768477292965272, + "ewc_loss": 0.01129150390625, + "ewc_loss_parallel": 1.1265277862548828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.32424259185791, + "learning_rate": 6.583298007630351e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8397361040115356, + "num_tokens": 59360743.0, + "step": 1554 + }, + { + "epoch": 0.19781198320824323, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.420591354370117, + "learning_rate": 6.58753709198813e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8380330801010132, + "num_tokens": 59404167.0, + "step": 1555 + }, + { + "epoch": 0.19793919348683373, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.424174308776855, + "learning_rate": 6.591776176345909e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.8447147607803345, + "num_tokens": 59447770.0, + "step": 1556 + }, + { + "epoch": 0.19806640376542425, + "ewc_loss": 0.01129150390625, + "ewc_loss_parallel": 1.1265277862548828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.436230659484863, + "learning_rate": 6.596015260703688e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.849539041519165, + "num_tokens": 59494989.0, + "step": 1557 + }, + { + "epoch": 0.19819361404401475, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.452295303344727, + "learning_rate": 6.600254345061466e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8500795364379883, + "num_tokens": 59534926.0, + "step": 1558 + }, + { + "epoch": 0.19832082432260525, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.5869779586792, + "learning_rate": 6.604493429419246e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8350193500518799, + "num_tokens": 59577832.0, + "step": 1559 + }, + { + "epoch": 0.19844803460119578, + "ewc_loss": 0.01129150390625, + "ewc_loss_parallel": 1.1265277862548828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.474446296691895, + "learning_rate": 6.608732513777023e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8388439416885376, + "num_tokens": 59624995.0, + "step": 1560 + }, + { + "epoch": 0.19857524487978628, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.894143104553223, + "learning_rate": 6.612971598134803e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8437979221343994, + "num_tokens": 59663000.0, + "step": 1561 + }, + { + "epoch": 0.19870245515837678, + "ewc_loss": 0.01141357421875, + "ewc_loss_parallel": 1.138448715209961e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.384955406188965, + "learning_rate": 6.617210682492581e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8282244205474854, + "num_tokens": 59700241.0, + "step": 1562 + }, + { + "epoch": 0.1988296654369673, + "ewc_loss": 0.01129150390625, + "ewc_loss_parallel": 1.1265277862548828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.547113418579102, + "learning_rate": 6.62144976685036e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8558753728866577, + "num_tokens": 59737244.0, + "step": 1563 + }, + { + "epoch": 0.1989568757155578, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.1444091796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.561962127685547, + "learning_rate": 6.625688851208139e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8384894132614136, + "num_tokens": 59775538.0, + "step": 1564 + }, + { + "epoch": 0.19908408599414834, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.441300392150879, + "learning_rate": 6.629927935565918e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8380743265151978, + "num_tokens": 59810373.0, + "step": 1565 + }, + { + "epoch": 0.19921129627273884, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.469098091125488, + "learning_rate": 6.634167019923696e-07, + "loss": 0.5333, + "mean_token_accuracy": 0.834197998046875, + "num_tokens": 59849992.0, + "step": 1566 + }, + { + "epoch": 0.19933850655132934, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.443674087524414, + "learning_rate": 6.638406104281476e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8408128619194031, + "num_tokens": 59882552.0, + "step": 1567 + }, + { + "epoch": 0.19946571682991987, + "ewc_loss": 0.01129150390625, + "ewc_loss_parallel": 1.1265277862548828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.540221214294434, + "learning_rate": 6.642645188639253e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8471479415893555, + "num_tokens": 59925090.0, + "step": 1568 + }, + { + "epoch": 0.19959292710851037, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.1444091796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.40590763092041, + "learning_rate": 6.646884272997032e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8347338438034058, + "num_tokens": 59965406.0, + "step": 1569 + }, + { + "epoch": 0.19972013738710087, + "ewc_loss": 0.0113525390625, + "ewc_loss_parallel": 1.1324882507324219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.270981788635254, + "learning_rate": 6.651123357354811e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8386247158050537, + "num_tokens": 60001770.0, + "step": 1570 + }, + { + "epoch": 0.1998473476656914, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.1444091796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.498615264892578, + "learning_rate": 6.655362441712589e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8326208591461182, + "num_tokens": 60036911.0, + "step": 1571 + }, + { + "epoch": 0.1999745579442819, + "ewc_loss": 0.01153564453125, + "ewc_loss_parallel": 1.1563301086425781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.296235084533691, + "learning_rate": 6.659601526070369e-07, + "loss": 0.5494, + "mean_token_accuracy": 0.8291100263595581, + "num_tokens": 60080210.0, + "step": 1572 + }, + { + "epoch": 0.2001017682228724, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.150369644165039e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.557855606079102, + "learning_rate": 6.663840610428147e-07, + "loss": 0.5848, + "mean_token_accuracy": 0.821843147277832, + "num_tokens": 60111533.0, + "step": 1573 + }, + { + "epoch": 0.20022897850146293, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.532612800598145, + "learning_rate": 6.668079694785926e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.8531987071037292, + "num_tokens": 60150733.0, + "step": 1574 + }, + { + "epoch": 0.20035618878005343, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.150369644165039e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.43940258026123, + "learning_rate": 6.672318779143704e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8463480472564697, + "num_tokens": 60185034.0, + "step": 1575 + }, + { + "epoch": 0.20048339905864393, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.150369644165039e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.569955825805664, + "learning_rate": 6.676557863501483e-07, + "loss": 0.5118, + "mean_token_accuracy": 0.8368837833404541, + "num_tokens": 60216865.0, + "step": 1576 + }, + { + "epoch": 0.20061060933723446, + "ewc_loss": 0.01165771484375, + "ewc_loss_parallel": 1.1682510375976562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.595352172851562, + "learning_rate": 6.680796947859262e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8396444916725159, + "num_tokens": 60257685.0, + "step": 1577 + }, + { + "epoch": 0.20073781961582496, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.150369644165039e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.61436653137207, + "learning_rate": 6.685036032217041e-07, + "loss": 0.4938, + "mean_token_accuracy": 0.8431394100189209, + "num_tokens": 60294258.0, + "step": 1578 + }, + { + "epoch": 0.20086502989441546, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.587539672851562, + "learning_rate": 6.689275116574819e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.848354697227478, + "num_tokens": 60330319.0, + "step": 1579 + }, + { + "epoch": 0.200992240173006, + "ewc_loss": 0.01153564453125, + "ewc_loss_parallel": 1.1563301086425781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.593533515930176, + "learning_rate": 6.693514200932599e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8435118198394775, + "num_tokens": 60370482.0, + "step": 1580 + }, + { + "epoch": 0.2011194504515965, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.552387237548828, + "learning_rate": 6.697753285290377e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8383063077926636, + "num_tokens": 60412436.0, + "step": 1581 + }, + { + "epoch": 0.201246660730187, + "ewc_loss": 0.01165771484375, + "ewc_loss_parallel": 1.1682510375976562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.653478622436523, + "learning_rate": 6.701992369648156e-07, + "loss": 0.4298, + "mean_token_accuracy": 0.8621861934661865, + "num_tokens": 60448982.0, + "step": 1582 + }, + { + "epoch": 0.20137387100877752, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.483749389648438, + "learning_rate": 6.706231454005934e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.854668378829956, + "num_tokens": 60489878.0, + "step": 1583 + }, + { + "epoch": 0.20150108128736802, + "ewc_loss": 0.01153564453125, + "ewc_loss_parallel": 1.1563301086425781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.631285667419434, + "learning_rate": 6.710470538363713e-07, + "loss": 0.4833, + "mean_token_accuracy": 0.8483504056930542, + "num_tokens": 60528979.0, + "step": 1584 + }, + { + "epoch": 0.20162829156595852, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.553683280944824, + "learning_rate": 6.714709622721492e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.8368327021598816, + "num_tokens": 60561670.0, + "step": 1585 + }, + { + "epoch": 0.20175550184454905, + "ewc_loss": 0.011474609375, + "ewc_loss_parallel": 1.150369644165039e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.581741333007812, + "learning_rate": 6.718948707079271e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8434006571769714, + "num_tokens": 60602061.0, + "step": 1586 + }, + { + "epoch": 0.20188271212313955, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.516776084899902, + "learning_rate": 6.723187791437049e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.8435348272323608, + "num_tokens": 60639805.0, + "step": 1587 + }, + { + "epoch": 0.20200992240173005, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.633248329162598, + "learning_rate": 6.727426875794829e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8367379903793335, + "num_tokens": 60681816.0, + "step": 1588 + }, + { + "epoch": 0.20213713268032057, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.656110763549805, + "learning_rate": 6.731665960152607e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8376293182373047, + "num_tokens": 60726343.0, + "step": 1589 + }, + { + "epoch": 0.20226434295891108, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.695756912231445, + "learning_rate": 6.735905044510385e-07, + "loss": 0.5439, + "mean_token_accuracy": 0.8216512203216553, + "num_tokens": 60764814.0, + "step": 1590 + }, + { + "epoch": 0.2023915532375016, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.599090576171875, + "learning_rate": 6.740144128868164e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8377493619918823, + "num_tokens": 60802384.0, + "step": 1591 + }, + { + "epoch": 0.2025187635160921, + "ewc_loss": 0.01165771484375, + "ewc_loss_parallel": 1.1682510375976562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.507332801818848, + "learning_rate": 6.744383213225942e-07, + "loss": 0.5379, + "mean_token_accuracy": 0.8283106088638306, + "num_tokens": 60842085.0, + "step": 1592 + }, + { + "epoch": 0.2026459737946826, + "ewc_loss": 0.0115966796875, + "ewc_loss_parallel": 1.1622905731201172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.536454200744629, + "learning_rate": 6.748622297583722e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.8446212410926819, + "num_tokens": 60874835.0, + "step": 1593 + }, + { + "epoch": 0.20277318407327313, + "ewc_loss": 0.01165771484375, + "ewc_loss_parallel": 1.1682510375976562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.533506393432617, + "learning_rate": 6.7528613819415e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.838416576385498, + "num_tokens": 60910682.0, + "step": 1594 + }, + { + "epoch": 0.20290039435186363, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.578856468200684, + "learning_rate": 6.757100466299279e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8445016145706177, + "num_tokens": 60951352.0, + "step": 1595 + }, + { + "epoch": 0.20302760463045413, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.712608337402344, + "learning_rate": 6.761339550657058e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8304260969161987, + "num_tokens": 60979055.0, + "step": 1596 + }, + { + "epoch": 0.20315481490904466, + "ewc_loss": 0.01165771484375, + "ewc_loss_parallel": 1.1682510375976562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.519247055053711, + "learning_rate": 6.765578635014837e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8426333665847778, + "num_tokens": 61017437.0, + "step": 1597 + }, + { + "epoch": 0.20328202518763516, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.553820610046387, + "learning_rate": 6.769817719372614e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8466622829437256, + "num_tokens": 61052286.0, + "step": 1598 + }, + { + "epoch": 0.20340923546622566, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.62208080291748, + "learning_rate": 6.774056803730394e-07, + "loss": 0.5232, + "mean_token_accuracy": 0.8328675031661987, + "num_tokens": 61086110.0, + "step": 1599 + }, + { + "epoch": 0.2035364457448162, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.555632591247559, + "learning_rate": 6.778295888088172e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8414944410324097, + "num_tokens": 61124181.0, + "step": 1600 + }, + { + "epoch": 0.2036636560234067, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.60610580444336, + "learning_rate": 6.782534972445952e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.8342167735099792, + "num_tokens": 61160881.0, + "step": 1601 + }, + { + "epoch": 0.2037908663019972, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.528541564941406, + "learning_rate": 6.78677405680373e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8387186527252197, + "num_tokens": 61200755.0, + "step": 1602 + }, + { + "epoch": 0.20391807658058772, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.59168815612793, + "learning_rate": 6.791013141161509e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8377569913864136, + "num_tokens": 61240045.0, + "step": 1603 + }, + { + "epoch": 0.20404528685917822, + "ewc_loss": 0.01165771484375, + "ewc_loss_parallel": 1.1682510375976562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.704890251159668, + "learning_rate": 6.795252225519288e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.8300440907478333, + "num_tokens": 61282832.0, + "step": 1604 + }, + { + "epoch": 0.20417249713776872, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.62867259979248, + "learning_rate": 6.799491309877067e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8377446532249451, + "num_tokens": 61319841.0, + "step": 1605 + }, + { + "epoch": 0.20429970741635925, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.773322105407715, + "learning_rate": 6.803730394234844e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8501055240631104, + "num_tokens": 61352732.0, + "step": 1606 + }, + { + "epoch": 0.20442691769494975, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.72280216217041, + "learning_rate": 6.807969478592624e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8412450551986694, + "num_tokens": 61388324.0, + "step": 1607 + }, + { + "epoch": 0.20455412797354025, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.642858505249023, + "learning_rate": 6.812208562950402e-07, + "loss": 0.5284, + "mean_token_accuracy": 0.8380742073059082, + "num_tokens": 61431222.0, + "step": 1608 + }, + { + "epoch": 0.20468133825213078, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.580619812011719, + "learning_rate": 6.816447647308182e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8233122229576111, + "num_tokens": 61475617.0, + "step": 1609 + }, + { + "epoch": 0.20480854853072128, + "ewc_loss": 0.01171875, + "ewc_loss_parallel": 1.1742115020751953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.752456665039062, + "learning_rate": 6.82068673166596e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8463245630264282, + "num_tokens": 61511338.0, + "step": 1610 + }, + { + "epoch": 0.20493575880931178, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.682183265686035, + "learning_rate": 6.824925816023738e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8364455699920654, + "num_tokens": 61549093.0, + "step": 1611 + }, + { + "epoch": 0.2050629690879023, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.524528503417969, + "learning_rate": 6.829164900381518e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8478879928588867, + "num_tokens": 61593843.0, + "step": 1612 + }, + { + "epoch": 0.2051901793664928, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.860389709472656, + "learning_rate": 6.833403984739295e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8430235981941223, + "num_tokens": 61629362.0, + "step": 1613 + }, + { + "epoch": 0.2053173896450833, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.666645050048828, + "learning_rate": 6.837643069097074e-07, + "loss": 0.5661, + "mean_token_accuracy": 0.8210057616233826, + "num_tokens": 61667037.0, + "step": 1614 + }, + { + "epoch": 0.20544459992367384, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.791309356689453, + "learning_rate": 6.841882153454853e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8432126045227051, + "num_tokens": 61701976.0, + "step": 1615 + }, + { + "epoch": 0.20557181020226434, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.717935562133789, + "learning_rate": 6.846121237812632e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8540757298469543, + "num_tokens": 61739068.0, + "step": 1616 + }, + { + "epoch": 0.20569902048085487, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.696657180786133, + "learning_rate": 6.850360322170411e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8511074185371399, + "num_tokens": 61781515.0, + "step": 1617 + }, + { + "epoch": 0.20582623075944537, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.716712951660156, + "learning_rate": 6.85459940652819e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8421876430511475, + "num_tokens": 61815201.0, + "step": 1618 + }, + { + "epoch": 0.20595344103803587, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.764232635498047, + "learning_rate": 6.858838490885968e-07, + "loss": 0.5391, + "mean_token_accuracy": 0.828083872795105, + "num_tokens": 61856639.0, + "step": 1619 + }, + { + "epoch": 0.2060806513166264, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.677084922790527, + "learning_rate": 6.863077575243748e-07, + "loss": 0.5521, + "mean_token_accuracy": 0.8287068009376526, + "num_tokens": 61894313.0, + "step": 1620 + }, + { + "epoch": 0.2062078615952169, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.8160982131958, + "learning_rate": 6.867316659601525e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.842766284942627, + "num_tokens": 61930748.0, + "step": 1621 + }, + { + "epoch": 0.2063350718738074, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.871959686279297, + "learning_rate": 6.871555743959304e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.841758131980896, + "num_tokens": 61972002.0, + "step": 1622 + }, + { + "epoch": 0.20646228215239792, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.635939598083496, + "learning_rate": 6.875794828317083e-07, + "loss": 0.4773, + "mean_token_accuracy": 0.8481923341751099, + "num_tokens": 62010141.0, + "step": 1623 + }, + { + "epoch": 0.20658949243098843, + "ewc_loss": 0.01177978515625, + "ewc_loss_parallel": 1.1801719665527344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.668034553527832, + "learning_rate": 6.880033912674862e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8533011078834534, + "num_tokens": 62055907.0, + "step": 1624 + }, + { + "epoch": 0.20671670270957893, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.813193321228027, + "learning_rate": 6.884272997032641e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.8198856115341187, + "num_tokens": 62095885.0, + "step": 1625 + }, + { + "epoch": 0.20684391298816945, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.732625007629395, + "learning_rate": 6.88851208139042e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8522018790245056, + "num_tokens": 62130497.0, + "step": 1626 + }, + { + "epoch": 0.20697112326675995, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.789525985717773, + "learning_rate": 6.892751165748198e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.8309257626533508, + "num_tokens": 62170296.0, + "step": 1627 + }, + { + "epoch": 0.20709833354535045, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.757556915283203, + "learning_rate": 6.896990250105978e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8556491136550903, + "num_tokens": 62210168.0, + "step": 1628 + }, + { + "epoch": 0.20722554382394098, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.80148696899414, + "learning_rate": 6.901229334463755e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8426072597503662, + "num_tokens": 62245251.0, + "step": 1629 + }, + { + "epoch": 0.20735275410253148, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.645769119262695, + "learning_rate": 6.905468418821534e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8483014702796936, + "num_tokens": 62286124.0, + "step": 1630 + }, + { + "epoch": 0.20747996438112198, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.942180633544922, + "learning_rate": 6.909707503179313e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.840083122253418, + "num_tokens": 62325604.0, + "step": 1631 + }, + { + "epoch": 0.2076071746597125, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.65678882598877, + "learning_rate": 6.913946587537091e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8438014984130859, + "num_tokens": 62364467.0, + "step": 1632 + }, + { + "epoch": 0.207734384938303, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.84585952758789, + "learning_rate": 6.918185671894871e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8224788308143616, + "num_tokens": 62402075.0, + "step": 1633 + }, + { + "epoch": 0.2078615952168935, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.811942100524902, + "learning_rate": 6.922424756252649e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8390407562255859, + "num_tokens": 62437802.0, + "step": 1634 + }, + { + "epoch": 0.20798880549548404, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.736457824707031, + "learning_rate": 6.926663840610428e-07, + "loss": 0.4452, + "mean_token_accuracy": 0.8553357124328613, + "num_tokens": 62476176.0, + "step": 1635 + }, + { + "epoch": 0.20811601577407454, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.904755592346191, + "learning_rate": 6.930902924968206e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8396879434585571, + "num_tokens": 62514378.0, + "step": 1636 + }, + { + "epoch": 0.20824322605266504, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.725395202636719, + "learning_rate": 6.935142009325985e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8366139531135559, + "num_tokens": 62555302.0, + "step": 1637 + }, + { + "epoch": 0.20837043633125557, + "ewc_loss": 0.01202392578125, + "ewc_loss_parallel": 1.2040138244628906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.921116828918457, + "learning_rate": 6.939381093683764e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.8309392929077148, + "num_tokens": 62594356.0, + "step": 1638 + }, + { + "epoch": 0.20849764660984607, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.75913143157959, + "learning_rate": 6.943620178041543e-07, + "loss": 0.4483, + "mean_token_accuracy": 0.8560736179351807, + "num_tokens": 62632903.0, + "step": 1639 + }, + { + "epoch": 0.2086248568884366, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.772579193115234, + "learning_rate": 6.947859262399321e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8467634320259094, + "num_tokens": 62668979.0, + "step": 1640 + }, + { + "epoch": 0.2087520671670271, + "ewc_loss": 0.0118408203125, + "ewc_loss_parallel": 1.1861324310302734e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.816089630126953, + "learning_rate": 6.952098346757101e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8439229130744934, + "num_tokens": 62705867.0, + "step": 1641 + }, + { + "epoch": 0.2088792774456176, + "ewc_loss": 0.01190185546875, + "ewc_loss_parallel": 1.1920928955078125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.674177169799805, + "learning_rate": 6.956337431114879e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8473581075668335, + "num_tokens": 62744559.0, + "step": 1642 + }, + { + "epoch": 0.20900648772420813, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.821554183959961, + "learning_rate": 6.960576515472658e-07, + "loss": 0.5115, + "mean_token_accuracy": 0.8374745845794678, + "num_tokens": 62785221.0, + "step": 1643 + }, + { + "epoch": 0.20913369800279863, + "ewc_loss": 0.01202392578125, + "ewc_loss_parallel": 1.2040138244628906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.825197219848633, + "learning_rate": 6.964815599830436e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8396804332733154, + "num_tokens": 62820095.0, + "step": 1644 + }, + { + "epoch": 0.20926090828138913, + "ewc_loss": 0.01202392578125, + "ewc_loss_parallel": 1.2040138244628906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.721220970153809, + "learning_rate": 6.969054684188215e-07, + "loss": 0.5529, + "mean_token_accuracy": 0.821464478969574, + "num_tokens": 62863414.0, + "step": 1645 + }, + { + "epoch": 0.20938811855997966, + "ewc_loss": 0.01214599609375, + "ewc_loss_parallel": 1.2159347534179688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.990354537963867, + "learning_rate": 6.973293768545994e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8523005247116089, + "num_tokens": 62904647.0, + "step": 1646 + }, + { + "epoch": 0.20951532883857016, + "ewc_loss": 0.01202392578125, + "ewc_loss_parallel": 1.2040138244628906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.891809463500977, + "learning_rate": 6.977532852903773e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8399322032928467, + "num_tokens": 62949476.0, + "step": 1647 + }, + { + "epoch": 0.20964253911716066, + "ewc_loss": 0.01202392578125, + "ewc_loss_parallel": 1.2040138244628906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.13546371459961, + "learning_rate": 6.981771937261551e-07, + "loss": 0.5499, + "mean_token_accuracy": 0.8260796070098877, + "num_tokens": 62984784.0, + "step": 1648 + }, + { + "epoch": 0.2097697493957512, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.867329597473145, + "learning_rate": 6.986011021619331e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.8400739431381226, + "num_tokens": 63024395.0, + "step": 1649 + }, + { + "epoch": 0.2098969596743417, + "ewc_loss": 0.011962890625, + "ewc_loss_parallel": 1.1980533599853516e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.847925186157227, + "learning_rate": 6.990250105977109e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8372750282287598, + "num_tokens": 63061886.0, + "step": 1650 + }, + { + "epoch": 0.2100241699529322, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.148345947265625, + "learning_rate": 6.994489190334886e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8423511981964111, + "num_tokens": 63104150.0, + "step": 1651 + }, + { + "epoch": 0.21015138023152272, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.914776802062988, + "learning_rate": 6.998728274692666e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8561079502105713, + "num_tokens": 63144346.0, + "step": 1652 + }, + { + "epoch": 0.21027859051011322, + "ewc_loss": 0.01202392578125, + "ewc_loss_parallel": 1.2040138244628906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.764535903930664, + "learning_rate": 7.002967359050444e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8480390906333923, + "num_tokens": 63180042.0, + "step": 1653 + }, + { + "epoch": 0.21040580078870372, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.204858779907227, + "learning_rate": 7.007206443408224e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.856090784072876, + "num_tokens": 63216570.0, + "step": 1654 + }, + { + "epoch": 0.21053301106729425, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.037420272827148, + "learning_rate": 7.011445527766002e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.8424935340881348, + "num_tokens": 63253457.0, + "step": 1655 + }, + { + "epoch": 0.21066022134588475, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.729507446289062, + "learning_rate": 7.015684612123781e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8455663919448853, + "num_tokens": 63295430.0, + "step": 1656 + }, + { + "epoch": 0.21078743162447525, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.981553077697754, + "learning_rate": 7.01992369648156e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8422472476959229, + "num_tokens": 63340118.0, + "step": 1657 + }, + { + "epoch": 0.21091464190306577, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.001666069030762, + "learning_rate": 7.024162780839339e-07, + "loss": 0.5315, + "mean_token_accuracy": 0.8242891430854797, + "num_tokens": 63372037.0, + "step": 1658 + }, + { + "epoch": 0.21104185218165628, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.881229400634766, + "learning_rate": 7.028401865197116e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.8318896889686584, + "num_tokens": 63415508.0, + "step": 1659 + }, + { + "epoch": 0.21116906246024678, + "ewc_loss": 0.01202392578125, + "ewc_loss_parallel": 1.2040138244628906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.902131080627441, + "learning_rate": 7.032640949554896e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8577286601066589, + "num_tokens": 63459320.0, + "step": 1660 + }, + { + "epoch": 0.2112962727388373, + "ewc_loss": 0.01214599609375, + "ewc_loss_parallel": 1.2159347534179688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.929313659667969, + "learning_rate": 7.036880033912674e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.8326007127761841, + "num_tokens": 63499624.0, + "step": 1661 + }, + { + "epoch": 0.2114234830174278, + "ewc_loss": 0.01214599609375, + "ewc_loss_parallel": 1.2159347534179688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.90310001373291, + "learning_rate": 7.041119118270454e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8420111536979675, + "num_tokens": 63536507.0, + "step": 1662 + }, + { + "epoch": 0.2115506932960183, + "ewc_loss": 0.01214599609375, + "ewc_loss_parallel": 1.2159347534179688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.031546592712402, + "learning_rate": 7.045358202628232e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8336703181266785, + "num_tokens": 63576483.0, + "step": 1663 + }, + { + "epoch": 0.21167790357460883, + "ewc_loss": 0.0120849609375, + "ewc_loss_parallel": 1.2099742889404297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.835326194763184, + "learning_rate": 7.049597286986011e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8336660861968994, + "num_tokens": 63615568.0, + "step": 1664 + }, + { + "epoch": 0.21180511385319933, + "ewc_loss": 0.01226806640625, + "ewc_loss_parallel": 1.2278556823730469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.152824401855469, + "learning_rate": 7.05383637134379e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8461973071098328, + "num_tokens": 63650023.0, + "step": 1665 + }, + { + "epoch": 0.21193232413178986, + "ewc_loss": 0.01226806640625, + "ewc_loss_parallel": 1.2278556823730469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.8939790725708, + "learning_rate": 7.058075455701568e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8331170678138733, + "num_tokens": 63690229.0, + "step": 1666 + }, + { + "epoch": 0.21205953441038036, + "ewc_loss": 0.01220703125, + "ewc_loss_parallel": 1.2218952178955078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.161975860595703, + "learning_rate": 7.062314540059346e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.857050895690918, + "num_tokens": 63728169.0, + "step": 1667 + }, + { + "epoch": 0.21218674468897086, + "ewc_loss": 0.01226806640625, + "ewc_loss_parallel": 1.2278556823730469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.975542068481445, + "learning_rate": 7.066553624417126e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8451250791549683, + "num_tokens": 63762816.0, + "step": 1668 + }, + { + "epoch": 0.2123139549675614, + "ewc_loss": 0.01220703125, + "ewc_loss_parallel": 1.2218952178955078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.948997497558594, + "learning_rate": 7.070792708774904e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8399857878684998, + "num_tokens": 63804288.0, + "step": 1669 + }, + { + "epoch": 0.2124411652461519, + "ewc_loss": 0.01214599609375, + "ewc_loss_parallel": 1.2159347534179688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.09414005279541, + "learning_rate": 7.075031793132684e-07, + "loss": 0.5208, + "mean_token_accuracy": 0.8374834060668945, + "num_tokens": 63851365.0, + "step": 1670 + }, + { + "epoch": 0.2125683755247424, + "ewc_loss": 0.01226806640625, + "ewc_loss_parallel": 1.2278556823730469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.898455619812012, + "learning_rate": 7.079270877490462e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8476260900497437, + "num_tokens": 63886187.0, + "step": 1671 + }, + { + "epoch": 0.21269558580333292, + "ewc_loss": 0.01220703125, + "ewc_loss_parallel": 1.2218952178955078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.95721435546875, + "learning_rate": 7.08350996184824e-07, + "loss": 0.5566, + "mean_token_accuracy": 0.8249709606170654, + "num_tokens": 63927187.0, + "step": 1672 + }, + { + "epoch": 0.21282279608192342, + "ewc_loss": 0.01220703125, + "ewc_loss_parallel": 1.2218952178955078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.950007438659668, + "learning_rate": 7.08774904620602e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8503398299217224, + "num_tokens": 63964088.0, + "step": 1673 + }, + { + "epoch": 0.21295000636051392, + "ewc_loss": 0.01214599609375, + "ewc_loss_parallel": 1.2159347534179688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.0366792678833, + "learning_rate": 7.091988130563797e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.833580732345581, + "num_tokens": 64004931.0, + "step": 1674 + }, + { + "epoch": 0.21307721663910445, + "ewc_loss": 0.01220703125, + "ewc_loss_parallel": 1.2218952178955078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.948065757751465, + "learning_rate": 7.096227214921576e-07, + "loss": 0.5155, + "mean_token_accuracy": 0.8355704545974731, + "num_tokens": 64043966.0, + "step": 1675 + }, + { + "epoch": 0.21320442691769495, + "ewc_loss": 0.01220703125, + "ewc_loss_parallel": 1.2218952178955078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.023536682128906, + "learning_rate": 7.100466299279355e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8421194553375244, + "num_tokens": 64078886.0, + "step": 1676 + }, + { + "epoch": 0.21333163719628545, + "ewc_loss": 0.0123291015625, + "ewc_loss_parallel": 1.233816146850586e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.997989654541016, + "learning_rate": 7.104705383637134e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8390644788742065, + "num_tokens": 64117291.0, + "step": 1677 + }, + { + "epoch": 0.21345884747487598, + "ewc_loss": 0.01226806640625, + "ewc_loss_parallel": 1.2278556823730469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.927923202514648, + "learning_rate": 7.108944467994913e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.846312940120697, + "num_tokens": 64156369.0, + "step": 1678 + }, + { + "epoch": 0.21358605775346648, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.056793212890625, + "learning_rate": 7.113183552352692e-07, + "loss": 0.5515, + "mean_token_accuracy": 0.8206208944320679, + "num_tokens": 64193646.0, + "step": 1679 + }, + { + "epoch": 0.21371326803205698, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.124702453613281, + "learning_rate": 7.11742263671047e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8568878769874573, + "num_tokens": 64231933.0, + "step": 1680 + }, + { + "epoch": 0.2138404783106475, + "ewc_loss": 0.01220703125, + "ewc_loss_parallel": 1.2218952178955078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.148919105529785, + "learning_rate": 7.12166172106825e-07, + "loss": 0.5766, + "mean_token_accuracy": 0.8142541646957397, + "num_tokens": 64267418.0, + "step": 1681 + }, + { + "epoch": 0.213967688589238, + "ewc_loss": 0.0123291015625, + "ewc_loss_parallel": 1.233816146850586e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.9140625, + "learning_rate": 7.125900805426027e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8525717258453369, + "num_tokens": 64302440.0, + "step": 1682 + }, + { + "epoch": 0.2140948988678285, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.123143196105957, + "learning_rate": 7.130139889783806e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8412399291992188, + "num_tokens": 64338240.0, + "step": 1683 + }, + { + "epoch": 0.21422210914641904, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.91596508026123, + "learning_rate": 7.134378974141585e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8427497744560242, + "num_tokens": 64375734.0, + "step": 1684 + }, + { + "epoch": 0.21434931942500954, + "ewc_loss": 0.0123291015625, + "ewc_loss_parallel": 1.233816146850586e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.023362159729004, + "learning_rate": 7.138618058499364e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8467328548431396, + "num_tokens": 64411401.0, + "step": 1685 + }, + { + "epoch": 0.21447652970360004, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.031928062438965, + "learning_rate": 7.142857142857143e-07, + "loss": 0.4564, + "mean_token_accuracy": 0.8546421527862549, + "num_tokens": 64454719.0, + "step": 1686 + }, + { + "epoch": 0.21460373998219057, + "ewc_loss": 0.0123291015625, + "ewc_loss_parallel": 1.233816146850586e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.990389823913574, + "learning_rate": 7.147096227214922e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8463461399078369, + "num_tokens": 64495736.0, + "step": 1687 + }, + { + "epoch": 0.21473095026078107, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.984098434448242, + "learning_rate": 7.1513353115727e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8509575128555298, + "num_tokens": 64532843.0, + "step": 1688 + }, + { + "epoch": 0.21485816053937157, + "ewc_loss": 0.0123291015625, + "ewc_loss_parallel": 1.233816146850586e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.951310157775879, + "learning_rate": 7.155574395930479e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8335709571838379, + "num_tokens": 64568950.0, + "step": 1689 + }, + { + "epoch": 0.2149853708179621, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.071986198425293, + "learning_rate": 7.159813480288257e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8414452075958252, + "num_tokens": 64606878.0, + "step": 1690 + }, + { + "epoch": 0.2151125810965526, + "ewc_loss": 0.012451171875, + "ewc_loss_parallel": 1.245737075805664e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.01487922668457, + "learning_rate": 7.164052564646035e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8545402884483337, + "num_tokens": 64643012.0, + "step": 1691 + }, + { + "epoch": 0.21523979137514312, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.162788391113281, + "learning_rate": 7.168291649003815e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8523707389831543, + "num_tokens": 64684444.0, + "step": 1692 + }, + { + "epoch": 0.21536700165373363, + "ewc_loss": 0.01251220703125, + "ewc_loss_parallel": 1.2516975402832031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.097426414489746, + "learning_rate": 7.172530733361593e-07, + "loss": 0.5232, + "mean_token_accuracy": 0.8345057368278503, + "num_tokens": 64722068.0, + "step": 1693 + }, + { + "epoch": 0.21549421193232413, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.048905372619629, + "learning_rate": 7.176769817719373e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8655968904495239, + "num_tokens": 64761829.0, + "step": 1694 + }, + { + "epoch": 0.21562142221091465, + "ewc_loss": 0.012451171875, + "ewc_loss_parallel": 1.245737075805664e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.150838851928711, + "learning_rate": 7.181008902077151e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8470562696456909, + "num_tokens": 64799185.0, + "step": 1695 + }, + { + "epoch": 0.21574863248950515, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.167257308959961, + "learning_rate": 7.18524798643493e-07, + "loss": 0.511, + "mean_token_accuracy": 0.8385040163993835, + "num_tokens": 64839306.0, + "step": 1696 + }, + { + "epoch": 0.21587584276809565, + "ewc_loss": 0.01239013671875, + "ewc_loss_parallel": 1.239776611328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.000523567199707, + "learning_rate": 7.189487070792708e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8473718166351318, + "num_tokens": 64876011.0, + "step": 1697 + }, + { + "epoch": 0.21600305304668618, + "ewc_loss": 0.01251220703125, + "ewc_loss_parallel": 1.2516975402832031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.041399002075195, + "learning_rate": 7.193726155150487e-07, + "loss": 0.485, + "mean_token_accuracy": 0.847922146320343, + "num_tokens": 64918510.0, + "step": 1698 + }, + { + "epoch": 0.21613026332527668, + "ewc_loss": 0.012451171875, + "ewc_loss_parallel": 1.245737075805664e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.053359031677246, + "learning_rate": 7.197965239508265e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8436205387115479, + "num_tokens": 64957156.0, + "step": 1699 + }, + { + "epoch": 0.21625747360386718, + "ewc_loss": 0.0125732421875, + "ewc_loss_parallel": 1.2576580047607422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 14.961491584777832, + "learning_rate": 7.202204323866045e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.845818281173706, + "num_tokens": 64998761.0, + "step": 1700 + }, + { + "epoch": 0.2163846838824577, + "ewc_loss": 0.0125732421875, + "ewc_loss_parallel": 1.2576580047607422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.104619026184082, + "learning_rate": 7.206443408223823e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.840584933757782, + "num_tokens": 65038849.0, + "step": 1701 + }, + { + "epoch": 0.2165118941610482, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.119954109191895, + "learning_rate": 7.210682492581603e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8462426662445068, + "num_tokens": 65071130.0, + "step": 1702 + }, + { + "epoch": 0.2166391044396387, + "ewc_loss": 0.0125732421875, + "ewc_loss_parallel": 1.2576580047607422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.109335899353027, + "learning_rate": 7.214921576939381e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8322718143463135, + "num_tokens": 65111619.0, + "step": 1703 + }, + { + "epoch": 0.21676631471822924, + "ewc_loss": 0.0125732421875, + "ewc_loss_parallel": 1.2576580047607422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.062935829162598, + "learning_rate": 7.219160661297159e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8503813743591309, + "num_tokens": 65147537.0, + "step": 1704 + }, + { + "epoch": 0.21689352499681974, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.086376190185547, + "learning_rate": 7.223399745654938e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8405197858810425, + "num_tokens": 65187254.0, + "step": 1705 + }, + { + "epoch": 0.21702073527541024, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.18903636932373, + "learning_rate": 7.227638830012717e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.833581805229187, + "num_tokens": 65226130.0, + "step": 1706 + }, + { + "epoch": 0.21714794555400077, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.264060974121094, + "learning_rate": 7.231877914370495e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8507121801376343, + "num_tokens": 65261282.0, + "step": 1707 + }, + { + "epoch": 0.21727515583259127, + "ewc_loss": 0.0125732421875, + "ewc_loss_parallel": 1.2576580047607422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.335192680358887, + "learning_rate": 7.236116998728275e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8318140506744385, + "num_tokens": 65294850.0, + "step": 1708 + }, + { + "epoch": 0.21740236611118177, + "ewc_loss": 0.0125732421875, + "ewc_loss_parallel": 1.2576580047607422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.039770126342773, + "learning_rate": 7.240356083086053e-07, + "loss": 0.4163, + "mean_token_accuracy": 0.8647704720497131, + "num_tokens": 65332323.0, + "step": 1709 + }, + { + "epoch": 0.2175295763897723, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.242782592773438, + "learning_rate": 7.244595167443833e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.838172972202301, + "num_tokens": 65376033.0, + "step": 1710 + }, + { + "epoch": 0.2176567866683628, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.144503593444824, + "learning_rate": 7.248834251801611e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8494085073471069, + "num_tokens": 65417875.0, + "step": 1711 + }, + { + "epoch": 0.2177839969469533, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.009881973266602, + "learning_rate": 7.253073336159388e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8536325693130493, + "num_tokens": 65456675.0, + "step": 1712 + }, + { + "epoch": 0.21791120722554383, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.135117530822754, + "learning_rate": 7.257312420517168e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8553234934806824, + "num_tokens": 65487699.0, + "step": 1713 + }, + { + "epoch": 0.21803841750413433, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.180144309997559, + "learning_rate": 7.261551504874946e-07, + "loss": 0.4713, + "mean_token_accuracy": 0.8460460901260376, + "num_tokens": 65526143.0, + "step": 1714 + }, + { + "epoch": 0.21816562778272486, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.164257049560547, + "learning_rate": 7.265790589232725e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8504493236541748, + "num_tokens": 65562969.0, + "step": 1715 + }, + { + "epoch": 0.21829283806131536, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.078640937805176, + "learning_rate": 7.270029673590504e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.842228353023529, + "num_tokens": 65605999.0, + "step": 1716 + }, + { + "epoch": 0.21842004833990586, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.142930030822754, + "learning_rate": 7.274268757948283e-07, + "loss": 0.5464, + "mean_token_accuracy": 0.8279677033424377, + "num_tokens": 65644877.0, + "step": 1717 + }, + { + "epoch": 0.2185472586184964, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.355448722839355, + "learning_rate": 7.278507842306062e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8514090776443481, + "num_tokens": 65678823.0, + "step": 1718 + }, + { + "epoch": 0.2186744688970869, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.122355461120605, + "learning_rate": 7.282746926663841e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8288793563842773, + "num_tokens": 65717185.0, + "step": 1719 + }, + { + "epoch": 0.2188016791756774, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.281404495239258, + "learning_rate": 7.286986011021618e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8505336046218872, + "num_tokens": 65754422.0, + "step": 1720 + }, + { + "epoch": 0.21892888945426792, + "ewc_loss": 0.01275634765625, + "ewc_loss_parallel": 1.2755393981933594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.134153366088867, + "learning_rate": 7.291225095379398e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8404026031494141, + "num_tokens": 65799390.0, + "step": 1721 + }, + { + "epoch": 0.21905609973285842, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.08900260925293, + "learning_rate": 7.295464179737176e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8446599841117859, + "num_tokens": 65837566.0, + "step": 1722 + }, + { + "epoch": 0.21918331001144892, + "ewc_loss": 0.0128173828125, + "ewc_loss_parallel": 1.2814998626708984e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.286566734313965, + "learning_rate": 7.299703264094955e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8470199108123779, + "num_tokens": 65874537.0, + "step": 1723 + }, + { + "epoch": 0.21931052029003945, + "ewc_loss": 0.01275634765625, + "ewc_loss_parallel": 1.2755393981933594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.1121826171875, + "learning_rate": 7.303942348452734e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8380482792854309, + "num_tokens": 65915616.0, + "step": 1724 + }, + { + "epoch": 0.21943773056862995, + "ewc_loss": 0.01263427734375, + "ewc_loss_parallel": 1.2636184692382812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.150321960449219, + "learning_rate": 7.308181432810513e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8515826463699341, + "num_tokens": 65954263.0, + "step": 1725 + }, + { + "epoch": 0.21956494084722045, + "ewc_loss": 0.01275634765625, + "ewc_loss_parallel": 1.2755393981933594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.33428955078125, + "learning_rate": 7.312420517168292e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8412097692489624, + "num_tokens": 65992880.0, + "step": 1726 + }, + { + "epoch": 0.21969215112581097, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.055437088012695, + "learning_rate": 7.31665960152607e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8351512551307678, + "num_tokens": 66033958.0, + "step": 1727 + }, + { + "epoch": 0.21981936140440148, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.321747779846191, + "learning_rate": 7.320898685883848e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8457844853401184, + "num_tokens": 66072448.0, + "step": 1728 + }, + { + "epoch": 0.21994657168299198, + "ewc_loss": 0.0128173828125, + "ewc_loss_parallel": 1.2814998626708984e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.062057495117188, + "learning_rate": 7.325137770241628e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.8156198263168335, + "num_tokens": 66112648.0, + "step": 1729 + }, + { + "epoch": 0.2200737819615825, + "ewc_loss": 0.01275634765625, + "ewc_loss_parallel": 1.2755393981933594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.230755805969238, + "learning_rate": 7.329376854599406e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.8406274914741516, + "num_tokens": 66154791.0, + "step": 1730 + }, + { + "epoch": 0.220200992240173, + "ewc_loss": 0.01287841796875, + "ewc_loss_parallel": 1.2874603271484375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.19685173034668, + "learning_rate": 7.333615938957184e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8431364893913269, + "num_tokens": 66192450.0, + "step": 1731 + }, + { + "epoch": 0.2203282025187635, + "ewc_loss": 0.0128173828125, + "ewc_loss_parallel": 1.2814998626708984e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.195070266723633, + "learning_rate": 7.337855023314964e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8437871932983398, + "num_tokens": 66231173.0, + "step": 1732 + }, + { + "epoch": 0.22045541279735403, + "ewc_loss": 0.01287841796875, + "ewc_loss_parallel": 1.2874603271484375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.496698379516602, + "learning_rate": 7.342094107672742e-07, + "loss": 0.5364, + "mean_token_accuracy": 0.8279390931129456, + "num_tokens": 66272956.0, + "step": 1733 + }, + { + "epoch": 0.22058262307594453, + "ewc_loss": 0.0128173828125, + "ewc_loss_parallel": 1.2814998626708984e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.224892616271973, + "learning_rate": 7.346333192030522e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8477909564971924, + "num_tokens": 66309450.0, + "step": 1734 + }, + { + "epoch": 0.22070983335453503, + "ewc_loss": 0.0126953125, + "ewc_loss_parallel": 1.2695789337158203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.292984962463379, + "learning_rate": 7.350572276388299e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8261218667030334, + "num_tokens": 66350947.0, + "step": 1735 + }, + { + "epoch": 0.22083704363312556, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.345072746276855, + "learning_rate": 7.354811360746078e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8388299942016602, + "num_tokens": 66388364.0, + "step": 1736 + }, + { + "epoch": 0.22096425391171606, + "ewc_loss": 0.0128173828125, + "ewc_loss_parallel": 1.2814998626708984e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.231783866882324, + "learning_rate": 7.359050445103857e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8350645303726196, + "num_tokens": 66425029.0, + "step": 1737 + }, + { + "epoch": 0.22109146419030656, + "ewc_loss": 0.0128173828125, + "ewc_loss_parallel": 1.2814998626708984e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.26252269744873, + "learning_rate": 7.363289529461636e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.8446649312973022, + "num_tokens": 66463924.0, + "step": 1738 + }, + { + "epoch": 0.2212186744688971, + "ewc_loss": 0.0128173828125, + "ewc_loss_parallel": 1.2814998626708984e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.31887149810791, + "learning_rate": 7.367528613819415e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.857197105884552, + "num_tokens": 66504824.0, + "step": 1739 + }, + { + "epoch": 0.2213458847474876, + "ewc_loss": 0.01287841796875, + "ewc_loss_parallel": 1.2874603271484375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.278908729553223, + "learning_rate": 7.371767698177194e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8512871265411377, + "num_tokens": 66539308.0, + "step": 1740 + }, + { + "epoch": 0.22147309502607812, + "ewc_loss": 0.01287841796875, + "ewc_loss_parallel": 1.2874603271484375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.217623710632324, + "learning_rate": 7.376006782534972e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8532928228378296, + "num_tokens": 66572785.0, + "step": 1741 + }, + { + "epoch": 0.22160030530466862, + "ewc_loss": 0.01287841796875, + "ewc_loss_parallel": 1.2874603271484375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.140929222106934, + "learning_rate": 7.380245866892751e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8470246195793152, + "num_tokens": 66607890.0, + "step": 1742 + }, + { + "epoch": 0.22172751558325912, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.289922714233398, + "learning_rate": 7.384484951250529e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.848902702331543, + "num_tokens": 66650553.0, + "step": 1743 + }, + { + "epoch": 0.22185472586184965, + "ewc_loss": 0.01287841796875, + "ewc_loss_parallel": 1.2874603271484375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.296194076538086, + "learning_rate": 7.388724035608308e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8495045304298401, + "num_tokens": 66685330.0, + "step": 1744 + }, + { + "epoch": 0.22198193614044015, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.258359909057617, + "learning_rate": 7.392963119966087e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8533797264099121, + "num_tokens": 66716816.0, + "step": 1745 + }, + { + "epoch": 0.22210914641903065, + "ewc_loss": 0.01300048828125, + "ewc_loss_parallel": 1.2993812561035156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.330414772033691, + "learning_rate": 7.397202204323866e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8410443067550659, + "num_tokens": 66754030.0, + "step": 1746 + }, + { + "epoch": 0.22223635669762118, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.315831184387207, + "learning_rate": 7.401441288681645e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8536355495452881, + "num_tokens": 66793650.0, + "step": 1747 + }, + { + "epoch": 0.22236356697621168, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.370409965515137, + "learning_rate": 7.405680373039424e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8609002828598022, + "num_tokens": 66833709.0, + "step": 1748 + }, + { + "epoch": 0.22249077725480218, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.225540161132812, + "learning_rate": 7.409919457397202e-07, + "loss": 0.5111, + "mean_token_accuracy": 0.8361127972602844, + "num_tokens": 66866456.0, + "step": 1749 + }, + { + "epoch": 0.2226179875333927, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.360699653625488, + "learning_rate": 7.414158541754981e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8284934163093567, + "num_tokens": 66905875.0, + "step": 1750 + }, + { + "epoch": 0.2227451978119832, + "ewc_loss": 0.012939453125, + "ewc_loss_parallel": 1.2934207916259766e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.36828899383545, + "learning_rate": 7.418397626112759e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8516567945480347, + "num_tokens": 66936009.0, + "step": 1751 + }, + { + "epoch": 0.2228724080905737, + "ewc_loss": 0.01300048828125, + "ewc_loss_parallel": 1.2993812561035156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.304216384887695, + "learning_rate": 7.422636710470537e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8419325351715088, + "num_tokens": 66978111.0, + "step": 1752 + }, + { + "epoch": 0.22299961836916424, + "ewc_loss": 0.01312255859375, + "ewc_loss_parallel": 1.3113021850585938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.289727210998535, + "learning_rate": 7.426875794828317e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8417267799377441, + "num_tokens": 67015938.0, + "step": 1753 + }, + { + "epoch": 0.22312682864775474, + "ewc_loss": 0.01312255859375, + "ewc_loss_parallel": 1.3113021850585938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.317116737365723, + "learning_rate": 7.431114879186095e-07, + "loss": 0.5287, + "mean_token_accuracy": 0.8320104479789734, + "num_tokens": 67055456.0, + "step": 1754 + }, + { + "epoch": 0.22325403892634524, + "ewc_loss": 0.01300048828125, + "ewc_loss_parallel": 1.2993812561035156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.22780704498291, + "learning_rate": 7.435353963543875e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8454005718231201, + "num_tokens": 67096298.0, + "step": 1755 + }, + { + "epoch": 0.22338124920493577, + "ewc_loss": 0.01300048828125, + "ewc_loss_parallel": 1.2993812561035156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.271021842956543, + "learning_rate": 7.439593047901653e-07, + "loss": 0.4408, + "mean_token_accuracy": 0.8573994636535645, + "num_tokens": 67135280.0, + "step": 1756 + }, + { + "epoch": 0.22350845948352627, + "ewc_loss": 0.01324462890625, + "ewc_loss_parallel": 1.3232231140136719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.211397171020508, + "learning_rate": 7.443832132259431e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8372431993484497, + "num_tokens": 67171936.0, + "step": 1757 + }, + { + "epoch": 0.22363566976211677, + "ewc_loss": 0.0130615234375, + "ewc_loss_parallel": 1.3053417205810547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.338772773742676, + "learning_rate": 7.44807121661721e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8408916592597961, + "num_tokens": 67209854.0, + "step": 1758 + }, + { + "epoch": 0.2237628800407073, + "ewc_loss": 0.01312255859375, + "ewc_loss_parallel": 1.3113021850585938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.276137351989746, + "learning_rate": 7.452310300974989e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8522285223007202, + "num_tokens": 67244369.0, + "step": 1759 + }, + { + "epoch": 0.2238900903192978, + "ewc_loss": 0.0130615234375, + "ewc_loss_parallel": 1.3053417205810547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.27319049835205, + "learning_rate": 7.456549385332767e-07, + "loss": 0.5279, + "mean_token_accuracy": 0.8324698805809021, + "num_tokens": 67283994.0, + "step": 1760 + }, + { + "epoch": 0.2240173005978883, + "ewc_loss": 0.01312255859375, + "ewc_loss_parallel": 1.3113021850585938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.416006088256836, + "learning_rate": 7.460788469690547e-07, + "loss": 0.4129, + "mean_token_accuracy": 0.8640729784965515, + "num_tokens": 67323230.0, + "step": 1761 + }, + { + "epoch": 0.22414451087647883, + "ewc_loss": 0.0130615234375, + "ewc_loss_parallel": 1.3053417205810547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.27641487121582, + "learning_rate": 7.465027554048325e-07, + "loss": 0.5675, + "mean_token_accuracy": 0.8232094049453735, + "num_tokens": 67366198.0, + "step": 1762 + }, + { + "epoch": 0.22427172115506933, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.535512924194336, + "learning_rate": 7.469266638406105e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8277937769889832, + "num_tokens": 67410685.0, + "step": 1763 + }, + { + "epoch": 0.22439893143365983, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.397891998291016, + "learning_rate": 7.473505722763883e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8492611646652222, + "num_tokens": 67445818.0, + "step": 1764 + }, + { + "epoch": 0.22452614171225035, + "ewc_loss": 0.0130615234375, + "ewc_loss_parallel": 1.3053417205810547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.50932502746582, + "learning_rate": 7.477744807121661e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8363776206970215, + "num_tokens": 67483873.0, + "step": 1765 + }, + { + "epoch": 0.22465335199084085, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.50573444366455, + "learning_rate": 7.48198389147944e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8462691307067871, + "num_tokens": 67519524.0, + "step": 1766 + }, + { + "epoch": 0.22478056226943138, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.362458229064941, + "learning_rate": 7.486222975837219e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8450484871864319, + "num_tokens": 67560920.0, + "step": 1767 + }, + { + "epoch": 0.22490777254802188, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.717381477355957, + "learning_rate": 7.490462060194997e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8521252870559692, + "num_tokens": 67594948.0, + "step": 1768 + }, + { + "epoch": 0.22503498282661238, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.532065391540527, + "learning_rate": 7.494701144552777e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.8283704519271851, + "num_tokens": 67628867.0, + "step": 1769 + }, + { + "epoch": 0.2251621931052029, + "ewc_loss": 0.0130615234375, + "ewc_loss_parallel": 1.3053417205810547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.350523948669434, + "learning_rate": 7.498940228910555e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8400921821594238, + "num_tokens": 67669762.0, + "step": 1770 + }, + { + "epoch": 0.2252894033837934, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.522368431091309, + "learning_rate": 7.503179313268335e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8417670130729675, + "num_tokens": 67704750.0, + "step": 1771 + }, + { + "epoch": 0.2254166136623839, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.521617889404297, + "learning_rate": 7.507418397626113e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.8338108658790588, + "num_tokens": 67745181.0, + "step": 1772 + }, + { + "epoch": 0.22554382394097444, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.283381462097168, + "learning_rate": 7.51165748198389e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8359849452972412, + "num_tokens": 67785929.0, + "step": 1773 + }, + { + "epoch": 0.22567103421956494, + "ewc_loss": 0.01312255859375, + "ewc_loss_parallel": 1.3113021850585938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.400461196899414, + "learning_rate": 7.51589656634167e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.8470073938369751, + "num_tokens": 67821887.0, + "step": 1774 + }, + { + "epoch": 0.22579824449815544, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.43391227722168, + "learning_rate": 7.520135650699448e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8400801420211792, + "num_tokens": 67859608.0, + "step": 1775 + }, + { + "epoch": 0.22592545477674597, + "ewc_loss": 0.01336669921875, + "ewc_loss_parallel": 1.33514404296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.563980102539062, + "learning_rate": 7.524374735057227e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8390763998031616, + "num_tokens": 67895287.0, + "step": 1776 + }, + { + "epoch": 0.22605266505533647, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.554306030273438, + "learning_rate": 7.528613819415006e-07, + "loss": 0.465, + "mean_token_accuracy": 0.8550572991371155, + "num_tokens": 67930358.0, + "step": 1777 + }, + { + "epoch": 0.22617987533392697, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.492141723632812, + "learning_rate": 7.532852903772785e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8376335501670837, + "num_tokens": 67966112.0, + "step": 1778 + }, + { + "epoch": 0.2263070856125175, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.012676239013672, + "learning_rate": 7.537091988130564e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8330185413360596, + "num_tokens": 68001297.0, + "step": 1779 + }, + { + "epoch": 0.226434295891108, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.456533432006836, + "learning_rate": 7.541331072488342e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.84180748462677, + "num_tokens": 68037346.0, + "step": 1780 + }, + { + "epoch": 0.2265615061696985, + "ewc_loss": 0.0130615234375, + "ewc_loss_parallel": 1.3053417205810547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.662946701049805, + "learning_rate": 7.54557015684612e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8407467603683472, + "num_tokens": 68076566.0, + "step": 1781 + }, + { + "epoch": 0.22668871644828903, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.6583251953125, + "learning_rate": 7.5498092412039e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8515999913215637, + "num_tokens": 68116834.0, + "step": 1782 + }, + { + "epoch": 0.22681592672687953, + "ewc_loss": 0.0130615234375, + "ewc_loss_parallel": 1.3053417205810547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.310632705688477, + "learning_rate": 7.554048325561678e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8570326566696167, + "num_tokens": 68155573.0, + "step": 1783 + }, + { + "epoch": 0.22694313700547003, + "ewc_loss": 0.01318359375, + "ewc_loss_parallel": 1.3172626495361328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.543944358825684, + "learning_rate": 7.558287409919457e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8297414183616638, + "num_tokens": 68193378.0, + "step": 1784 + }, + { + "epoch": 0.22707034728406056, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.52973461151123, + "learning_rate": 7.562526494277236e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.833582878112793, + "num_tokens": 68230183.0, + "step": 1785 + }, + { + "epoch": 0.22719755756265106, + "ewc_loss": 0.01336669921875, + "ewc_loss_parallel": 1.33514404296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.51160717010498, + "learning_rate": 7.566765578635015e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8464821577072144, + "num_tokens": 68270526.0, + "step": 1786 + }, + { + "epoch": 0.22732476784124156, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.431327819824219, + "learning_rate": 7.571004662992794e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8413958549499512, + "num_tokens": 68308690.0, + "step": 1787 + }, + { + "epoch": 0.2274519781198321, + "ewc_loss": 0.01324462890625, + "ewc_loss_parallel": 1.3232231140136719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.586213111877441, + "learning_rate": 7.575243747350572e-07, + "loss": 0.5506, + "mean_token_accuracy": 0.8277167081832886, + "num_tokens": 68348660.0, + "step": 1788 + }, + { + "epoch": 0.2275791883984226, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.424886703491211, + "learning_rate": 7.57948283170835e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8439385294914246, + "num_tokens": 68389129.0, + "step": 1789 + }, + { + "epoch": 0.2277063986770131, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.506710052490234, + "learning_rate": 7.58372191606613e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8442832827568054, + "num_tokens": 68423234.0, + "step": 1790 + }, + { + "epoch": 0.22783360895560362, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.557100296020508, + "learning_rate": 7.587961000423908e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.8425458669662476, + "num_tokens": 68460834.0, + "step": 1791 + }, + { + "epoch": 0.22796081923419412, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.611506462097168, + "learning_rate": 7.592200084781686e-07, + "loss": 0.5422, + "mean_token_accuracy": 0.8318078517913818, + "num_tokens": 68495605.0, + "step": 1792 + }, + { + "epoch": 0.22808802951278465, + "ewc_loss": 0.01324462890625, + "ewc_loss_parallel": 1.3232231140136719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.647872924804688, + "learning_rate": 7.596439169139466e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8439940214157104, + "num_tokens": 68530838.0, + "step": 1793 + }, + { + "epoch": 0.22821523979137515, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.699058532714844, + "learning_rate": 7.600678253497244e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8473274111747742, + "num_tokens": 68571225.0, + "step": 1794 + }, + { + "epoch": 0.22834245006996565, + "ewc_loss": 0.0135498046875, + "ewc_loss_parallel": 1.3530254364013672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.969966888427734, + "learning_rate": 7.604917337855023e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8537826538085938, + "num_tokens": 68608284.0, + "step": 1795 + }, + { + "epoch": 0.22846966034855618, + "ewc_loss": 0.01336669921875, + "ewc_loss_parallel": 1.33514404296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.659420013427734, + "learning_rate": 7.609156422212801e-07, + "loss": 0.4458, + "mean_token_accuracy": 0.8566625118255615, + "num_tokens": 68645943.0, + "step": 1796 + }, + { + "epoch": 0.22859687062714668, + "ewc_loss": 0.01324462890625, + "ewc_loss_parallel": 1.3232231140136719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.543681144714355, + "learning_rate": 7.61339550657058e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8485193252563477, + "num_tokens": 68684245.0, + "step": 1797 + }, + { + "epoch": 0.22872408090573718, + "ewc_loss": 0.01336669921875, + "ewc_loss_parallel": 1.33514404296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.765536308288574, + "learning_rate": 7.617634590928359e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8483557105064392, + "num_tokens": 68717532.0, + "step": 1798 + }, + { + "epoch": 0.2288512911843277, + "ewc_loss": 0.01324462890625, + "ewc_loss_parallel": 1.3232231140136719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.543257713317871, + "learning_rate": 7.621873675286138e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8418962955474854, + "num_tokens": 68759515.0, + "step": 1799 + }, + { + "epoch": 0.2289785014629182, + "ewc_loss": 0.01348876953125, + "ewc_loss_parallel": 1.3470649719238281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.573440551757812, + "learning_rate": 7.626112759643916e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8364842534065247, + "num_tokens": 68796969.0, + "step": 1800 + }, + { + "epoch": 0.2291057117415087, + "ewc_loss": 0.01348876953125, + "ewc_loss_parallel": 1.3470649719238281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.795574188232422, + "learning_rate": 7.630351844001696e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8424214124679565, + "num_tokens": 68831138.0, + "step": 1801 + }, + { + "epoch": 0.22923292202009923, + "ewc_loss": 0.0135498046875, + "ewc_loss_parallel": 1.3530254364013672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.612656593322754, + "learning_rate": 7.634590928359474e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8483468890190125, + "num_tokens": 68868741.0, + "step": 1802 + }, + { + "epoch": 0.22936013229868973, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.411928176879883, + "learning_rate": 7.638830012717253e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8388811945915222, + "num_tokens": 68909191.0, + "step": 1803 + }, + { + "epoch": 0.22948734257728023, + "ewc_loss": 0.01336669921875, + "ewc_loss_parallel": 1.33514404296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.678753852844238, + "learning_rate": 7.643069097075031e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8468382358551025, + "num_tokens": 68949261.0, + "step": 1804 + }, + { + "epoch": 0.22961455285587076, + "ewc_loss": 0.01348876953125, + "ewc_loss_parallel": 1.3470649719238281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.629150390625, + "learning_rate": 7.64730818143281e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8518558740615845, + "num_tokens": 68990434.0, + "step": 1805 + }, + { + "epoch": 0.22974176313446126, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.655515670776367, + "learning_rate": 7.651547265790589e-07, + "loss": 0.5328, + "mean_token_accuracy": 0.8315738439559937, + "num_tokens": 69029482.0, + "step": 1806 + }, + { + "epoch": 0.22986897341305176, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.651174545288086, + "learning_rate": 7.655786350148368e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8366343975067139, + "num_tokens": 69069766.0, + "step": 1807 + }, + { + "epoch": 0.2299961836916423, + "ewc_loss": 0.0133056640625, + "ewc_loss_parallel": 1.329183578491211e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.552387237548828, + "learning_rate": 7.660025434506146e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8415099382400513, + "num_tokens": 69114592.0, + "step": 1808 + }, + { + "epoch": 0.2301233939702328, + "ewc_loss": 0.01348876953125, + "ewc_loss_parallel": 1.3470649719238281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.522300720214844, + "learning_rate": 7.664264518863926e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8450610637664795, + "num_tokens": 69152599.0, + "step": 1809 + }, + { + "epoch": 0.2302506042488233, + "ewc_loss": 0.0135498046875, + "ewc_loss_parallel": 1.3530254364013672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.667068481445312, + "learning_rate": 7.668503603221704e-07, + "loss": 0.5399, + "mean_token_accuracy": 0.8255102634429932, + "num_tokens": 69193157.0, + "step": 1810 + }, + { + "epoch": 0.23037781452741382, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.649654388427734, + "learning_rate": 7.672742687579483e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8521043658256531, + "num_tokens": 69230681.0, + "step": 1811 + }, + { + "epoch": 0.23050502480600432, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.558501243591309, + "learning_rate": 7.676981771937261e-07, + "loss": 0.4302, + "mean_token_accuracy": 0.8571748733520508, + "num_tokens": 69261292.0, + "step": 1812 + }, + { + "epoch": 0.23063223508459482, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.59444808959961, + "learning_rate": 7.681220856295039e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8518713712692261, + "num_tokens": 69301795.0, + "step": 1813 + }, + { + "epoch": 0.23075944536318535, + "ewc_loss": 0.01348876953125, + "ewc_loss_parallel": 1.3470649719238281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.697613716125488, + "learning_rate": 7.685459940652819e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8423460721969604, + "num_tokens": 69337038.0, + "step": 1814 + }, + { + "epoch": 0.23088665564177585, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.541418075561523, + "learning_rate": 7.689699025010597e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8445755839347839, + "num_tokens": 69379871.0, + "step": 1815 + }, + { + "epoch": 0.23101386592036638, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.865463256835938, + "learning_rate": 7.693938109368376e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.838146448135376, + "num_tokens": 69418021.0, + "step": 1816 + }, + { + "epoch": 0.23114107619895688, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.13829803466797, + "learning_rate": 7.698177193726155e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8447362780570984, + "num_tokens": 69455574.0, + "step": 1817 + }, + { + "epoch": 0.23126828647754738, + "ewc_loss": 0.01348876953125, + "ewc_loss_parallel": 1.3470649719238281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.536333084106445, + "learning_rate": 7.702416278083933e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8400833010673523, + "num_tokens": 69489822.0, + "step": 1818 + }, + { + "epoch": 0.2313954967561379, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.351009368896484, + "learning_rate": 7.706655362441712e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8463813066482544, + "num_tokens": 69532821.0, + "step": 1819 + }, + { + "epoch": 0.2315227070347284, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.717137336730957, + "learning_rate": 7.710894446799491e-07, + "loss": 0.4597, + "mean_token_accuracy": 0.8503444194793701, + "num_tokens": 69565674.0, + "step": 1820 + }, + { + "epoch": 0.2316499173133189, + "ewc_loss": 0.013427734375, + "ewc_loss_parallel": 1.341104507446289e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.665715217590332, + "learning_rate": 7.715133531157269e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8547110557556152, + "num_tokens": 69606118.0, + "step": 1821 + }, + { + "epoch": 0.23177712759190944, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.120914459228516, + "learning_rate": 7.719372615515049e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.831981360912323, + "num_tokens": 69650102.0, + "step": 1822 + }, + { + "epoch": 0.23190433787049994, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.744611740112305, + "learning_rate": 7.723611699872827e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8462427854537964, + "num_tokens": 69693440.0, + "step": 1823 + }, + { + "epoch": 0.23203154814909044, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.528194427490234, + "learning_rate": 7.727850784230606e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8374661803245544, + "num_tokens": 69732651.0, + "step": 1824 + }, + { + "epoch": 0.23215875842768097, + "ewc_loss": 0.0135498046875, + "ewc_loss_parallel": 1.3530254364013672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.12316131591797, + "learning_rate": 7.732089868588385e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8407329320907593, + "num_tokens": 69769915.0, + "step": 1825 + }, + { + "epoch": 0.23228596870627147, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.66110610961914, + "learning_rate": 7.736328952946163e-07, + "loss": 0.432, + "mean_token_accuracy": 0.861801266670227, + "num_tokens": 69813943.0, + "step": 1826 + }, + { + "epoch": 0.23241317898486197, + "ewc_loss": 0.01348876953125, + "ewc_loss_parallel": 1.3470649719238281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.38420581817627, + "learning_rate": 7.740568037303942e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8487093448638916, + "num_tokens": 69847039.0, + "step": 1827 + }, + { + "epoch": 0.2325403892634525, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.795839309692383, + "learning_rate": 7.744807121661721e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.863652229309082, + "num_tokens": 69884588.0, + "step": 1828 + }, + { + "epoch": 0.232667599542043, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.73145580291748, + "learning_rate": 7.749046206019499e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8433988094329834, + "num_tokens": 69925552.0, + "step": 1829 + }, + { + "epoch": 0.2327948098206335, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.807867050170898, + "learning_rate": 7.753285290377279e-07, + "loss": 0.4246, + "mean_token_accuracy": 0.8616424202919006, + "num_tokens": 69961776.0, + "step": 1830 + }, + { + "epoch": 0.23292202009922403, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.78206729888916, + "learning_rate": 7.757524374735057e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.8302631378173828, + "num_tokens": 69999775.0, + "step": 1831 + }, + { + "epoch": 0.23304923037781453, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.697492599487305, + "learning_rate": 7.761763459092836e-07, + "loss": 0.4418, + "mean_token_accuracy": 0.861207127571106, + "num_tokens": 70037294.0, + "step": 1832 + }, + { + "epoch": 0.23317644065640503, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.988725662231445, + "learning_rate": 7.766002543450614e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8466432094573975, + "num_tokens": 70075014.0, + "step": 1833 + }, + { + "epoch": 0.23330365093499555, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.743636131286621, + "learning_rate": 7.770241627808392e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8606836795806885, + "num_tokens": 70112186.0, + "step": 1834 + }, + { + "epoch": 0.23343086121358606, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.892930030822754, + "learning_rate": 7.774480712166172e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8361047506332397, + "num_tokens": 70153285.0, + "step": 1835 + }, + { + "epoch": 0.23355807149217656, + "ewc_loss": 0.0137939453125, + "ewc_loss_parallel": 1.3768672943115234e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.808900833129883, + "learning_rate": 7.77871979652395e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8420778512954712, + "num_tokens": 70193382.0, + "step": 1836 + }, + { + "epoch": 0.23368528177076708, + "ewc_loss": 0.01361083984375, + "ewc_loss_parallel": 1.3589859008789062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.947002410888672, + "learning_rate": 7.782958880881729e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8444897532463074, + "num_tokens": 70227279.0, + "step": 1837 + }, + { + "epoch": 0.23381249204935758, + "ewc_loss": 0.013671875, + "ewc_loss_parallel": 1.3649463653564453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.850427627563477, + "learning_rate": 7.787197965239508e-07, + "loss": 0.4459, + "mean_token_accuracy": 0.8528091907501221, + "num_tokens": 70264395.0, + "step": 1838 + }, + { + "epoch": 0.23393970232794808, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.782670974731445, + "learning_rate": 7.791437049597287e-07, + "loss": 0.5631, + "mean_token_accuracy": 0.8220210075378418, + "num_tokens": 70295016.0, + "step": 1839 + }, + { + "epoch": 0.2340669126065386, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.695878028869629, + "learning_rate": 7.795676133955065e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8383074998855591, + "num_tokens": 70332774.0, + "step": 1840 + }, + { + "epoch": 0.2341941228851291, + "ewc_loss": 0.01385498046875, + "ewc_loss_parallel": 1.3828277587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.041954040527344, + "learning_rate": 7.799915218312844e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8422507643699646, + "num_tokens": 70366880.0, + "step": 1841 + }, + { + "epoch": 0.23432133316371964, + "ewc_loss": 0.01373291015625, + "ewc_loss_parallel": 1.3709068298339844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.720046043395996, + "learning_rate": 7.804154302670622e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8508517742156982, + "num_tokens": 70405235.0, + "step": 1842 + }, + { + "epoch": 0.23444854344231014, + "ewc_loss": 0.01385498046875, + "ewc_loss_parallel": 1.3828277587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.752041816711426, + "learning_rate": 7.808393387028402e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8454947471618652, + "num_tokens": 70446503.0, + "step": 1843 + }, + { + "epoch": 0.23457575372090064, + "ewc_loss": 0.0137939453125, + "ewc_loss_parallel": 1.3768672943115234e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.100372314453125, + "learning_rate": 7.81263247138618e-07, + "loss": 0.5267, + "mean_token_accuracy": 0.8313867449760437, + "num_tokens": 70483514.0, + "step": 1844 + }, + { + "epoch": 0.23470296399949117, + "ewc_loss": 0.01385498046875, + "ewc_loss_parallel": 1.3828277587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.759761810302734, + "learning_rate": 7.816871555743959e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.842577338218689, + "num_tokens": 70520944.0, + "step": 1845 + }, + { + "epoch": 0.23483017427808167, + "ewc_loss": 0.0137939453125, + "ewc_loss_parallel": 1.3768672943115234e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.878510475158691, + "learning_rate": 7.821110640101738e-07, + "loss": 0.511, + "mean_token_accuracy": 0.834726095199585, + "num_tokens": 70564399.0, + "step": 1846 + }, + { + "epoch": 0.23495738455667217, + "ewc_loss": 0.0137939453125, + "ewc_loss_parallel": 1.3768672943115234e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.862735748291016, + "learning_rate": 7.825349724459517e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.845803439617157, + "num_tokens": 70604000.0, + "step": 1847 + }, + { + "epoch": 0.2350845948352627, + "ewc_loss": 0.013916015625, + "ewc_loss_parallel": 1.3887882232666016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.711832046508789, + "learning_rate": 7.829588808817294e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8477498292922974, + "num_tokens": 70645133.0, + "step": 1848 + }, + { + "epoch": 0.2352118051138532, + "ewc_loss": 0.013916015625, + "ewc_loss_parallel": 1.3887882232666016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.730348587036133, + "learning_rate": 7.833827893175074e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8314738273620605, + "num_tokens": 70675824.0, + "step": 1849 + }, + { + "epoch": 0.2353390153924437, + "ewc_loss": 0.01385498046875, + "ewc_loss_parallel": 1.3828277587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.713469505310059, + "learning_rate": 7.838066977532852e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8476850986480713, + "num_tokens": 70714103.0, + "step": 1850 + }, + { + "epoch": 0.23546622567103423, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.3947486877441406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.888211250305176, + "learning_rate": 7.842306061890632e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8348374962806702, + "num_tokens": 70754265.0, + "step": 1851 + }, + { + "epoch": 0.23559343594962473, + "ewc_loss": 0.013916015625, + "ewc_loss_parallel": 1.3887882232666016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.742119789123535, + "learning_rate": 7.84654514624841e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8403937220573425, + "num_tokens": 70791368.0, + "step": 1852 + }, + { + "epoch": 0.23572064622821523, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.3947486877441406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.74248218536377, + "learning_rate": 7.850784230606188e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8321897387504578, + "num_tokens": 70827397.0, + "step": 1853 + }, + { + "epoch": 0.23584785650680576, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.3947486877441406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.956568717956543, + "learning_rate": 7.855023314963968e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8412234783172607, + "num_tokens": 70872503.0, + "step": 1854 + }, + { + "epoch": 0.23597506678539626, + "ewc_loss": 0.013916015625, + "ewc_loss_parallel": 1.3887882232666016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.791096687316895, + "learning_rate": 7.859262399321746e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8372113108634949, + "num_tokens": 70914376.0, + "step": 1855 + }, + { + "epoch": 0.23610227706398676, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.4007091522216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.966279983520508, + "learning_rate": 7.863501483679524e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8513384461402893, + "num_tokens": 70954004.0, + "step": 1856 + }, + { + "epoch": 0.2362294873425773, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.4007091522216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.948528289794922, + "learning_rate": 7.867740568037303e-07, + "loss": 0.4637, + "mean_token_accuracy": 0.847959578037262, + "num_tokens": 70991765.0, + "step": 1857 + }, + { + "epoch": 0.2363566976211678, + "ewc_loss": 0.013916015625, + "ewc_loss_parallel": 1.3887882232666016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.780752182006836, + "learning_rate": 7.871979652395082e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8391636610031128, + "num_tokens": 71025478.0, + "step": 1858 + }, + { + "epoch": 0.2364839078997583, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.89908504486084, + "learning_rate": 7.876218736752861e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8586655259132385, + "num_tokens": 71067287.0, + "step": 1859 + }, + { + "epoch": 0.23661111817834882, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.019746780395508, + "learning_rate": 7.88045782111064e-07, + "loss": 0.5174, + "mean_token_accuracy": 0.8364534974098206, + "num_tokens": 71108095.0, + "step": 1860 + }, + { + "epoch": 0.23673832845693932, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.3947486877441406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.894621849060059, + "learning_rate": 7.884696905468418e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8565007448196411, + "num_tokens": 71150297.0, + "step": 1861 + }, + { + "epoch": 0.23686553873552982, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.3947486877441406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.899049758911133, + "learning_rate": 7.888935989826198e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8381061553955078, + "num_tokens": 71189205.0, + "step": 1862 + }, + { + "epoch": 0.23699274901412035, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.910693168640137, + "learning_rate": 7.893175074183976e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8436882495880127, + "num_tokens": 71228141.0, + "step": 1863 + }, + { + "epoch": 0.23711995929271085, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.96704387664795, + "learning_rate": 7.897414158541754e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.8442503213882446, + "num_tokens": 71267854.0, + "step": 1864 + }, + { + "epoch": 0.23724716957130135, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.43419075012207, + "learning_rate": 7.901653242899533e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8379025459289551, + "num_tokens": 71305030.0, + "step": 1865 + }, + { + "epoch": 0.23737437984989188, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.4007091522216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.94784927368164, + "learning_rate": 7.905892327257312e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8471644520759583, + "num_tokens": 71342202.0, + "step": 1866 + }, + { + "epoch": 0.23750159012848238, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.65156364440918, + "learning_rate": 7.910131411615091e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8497942686080933, + "num_tokens": 71379702.0, + "step": 1867 + }, + { + "epoch": 0.2376288004070729, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.4007091522216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.97873592376709, + "learning_rate": 7.91437049597287e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8457337021827698, + "num_tokens": 71417785.0, + "step": 1868 + }, + { + "epoch": 0.2377560106856634, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.3947486877441406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.763128280639648, + "learning_rate": 7.918609580330648e-07, + "loss": 0.5246, + "mean_token_accuracy": 0.8323811888694763, + "num_tokens": 71461358.0, + "step": 1869 + }, + { + "epoch": 0.2378832209642539, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.388065338134766, + "learning_rate": 7.922848664688428e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.8359972238540649, + "num_tokens": 71500059.0, + "step": 1870 + }, + { + "epoch": 0.23801043124284443, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.3947486877441406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.88632869720459, + "learning_rate": 7.927087749046205e-07, + "loss": 0.4319, + "mean_token_accuracy": 0.8617045879364014, + "num_tokens": 71537043.0, + "step": 1871 + }, + { + "epoch": 0.23813764152143493, + "ewc_loss": 0.013916015625, + "ewc_loss_parallel": 1.3887882232666016e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.753966331481934, + "learning_rate": 7.931326833403983e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8477494716644287, + "num_tokens": 71570671.0, + "step": 1872 + }, + { + "epoch": 0.23826485180002543, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.4007091522216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.948641777038574, + "learning_rate": 7.935565917761763e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.847187876701355, + "num_tokens": 71614813.0, + "step": 1873 + }, + { + "epoch": 0.23839206207861596, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.055776596069336, + "learning_rate": 7.939805002119541e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8402226567268372, + "num_tokens": 71650441.0, + "step": 1874 + }, + { + "epoch": 0.23851927235720646, + "ewc_loss": 0.01397705078125, + "ewc_loss_parallel": 1.4007091522216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.958059310913086, + "learning_rate": 7.944044086477321e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.8414943218231201, + "num_tokens": 71690425.0, + "step": 1875 + }, + { + "epoch": 0.23864648263579696, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.782086372375488, + "learning_rate": 7.948283170835099e-07, + "loss": 0.4469, + "mean_token_accuracy": 0.8555009365081787, + "num_tokens": 71727349.0, + "step": 1876 + }, + { + "epoch": 0.2387736929143875, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.2465877532959, + "learning_rate": 7.952522255192878e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.86760014295578, + "num_tokens": 71770386.0, + "step": 1877 + }, + { + "epoch": 0.238900903192978, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.930253028869629, + "learning_rate": 7.956761339550657e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8571825623512268, + "num_tokens": 71808946.0, + "step": 1878 + }, + { + "epoch": 0.2390281134715685, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.903308868408203, + "learning_rate": 7.961000423908435e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8505847454071045, + "num_tokens": 71848605.0, + "step": 1879 + }, + { + "epoch": 0.23915532375015902, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.148204803466797, + "learning_rate": 7.965239508266214e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8491826057434082, + "num_tokens": 71886771.0, + "step": 1880 + }, + { + "epoch": 0.23928253402874952, + "ewc_loss": 0.0140380859375, + "ewc_loss_parallel": 1.4066696166992188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.867496490478516, + "learning_rate": 7.969478592623993e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8559621572494507, + "num_tokens": 71924905.0, + "step": 1881 + }, + { + "epoch": 0.23940974430734002, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.816497802734375, + "learning_rate": 7.973717676981771e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8503828048706055, + "num_tokens": 71964513.0, + "step": 1882 + }, + { + "epoch": 0.23953695458593055, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.23880958557129, + "learning_rate": 7.977956761339551e-07, + "loss": 0.4377, + "mean_token_accuracy": 0.8589995503425598, + "num_tokens": 72000918.0, + "step": 1883 + }, + { + "epoch": 0.23966416486452105, + "ewc_loss": 0.01422119140625, + "ewc_loss_parallel": 1.424551010131836e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.101530075073242, + "learning_rate": 7.982195845697329e-07, + "loss": 0.5685, + "mean_token_accuracy": 0.8198273181915283, + "num_tokens": 72040840.0, + "step": 1884 + }, + { + "epoch": 0.23979137514311155, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.064260482788086, + "learning_rate": 7.986434930055108e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8562836647033691, + "num_tokens": 72076757.0, + "step": 1885 + }, + { + "epoch": 0.23991858542170208, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.960433006286621, + "learning_rate": 7.990674014412886e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8429090976715088, + "num_tokens": 72120760.0, + "step": 1886 + }, + { + "epoch": 0.24004579570029258, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.069683074951172, + "learning_rate": 7.994913098770665e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8428739905357361, + "num_tokens": 72159985.0, + "step": 1887 + }, + { + "epoch": 0.24017300597888308, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.14586067199707, + "learning_rate": 7.999152183128444e-07, + "loss": 0.5185, + "mean_token_accuracy": 0.8333330154418945, + "num_tokens": 72196377.0, + "step": 1888 + }, + { + "epoch": 0.2403002162574736, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.956576347351074, + "learning_rate": 8.003391267486223e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8501458168029785, + "num_tokens": 72236585.0, + "step": 1889 + }, + { + "epoch": 0.2404274265360641, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.1302547454834, + "learning_rate": 8.007630351844001e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8453172445297241, + "num_tokens": 72268450.0, + "step": 1890 + }, + { + "epoch": 0.24055463681465464, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.99983024597168, + "learning_rate": 8.011869436201781e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8504749536514282, + "num_tokens": 72304092.0, + "step": 1891 + }, + { + "epoch": 0.24068184709324514, + "ewc_loss": 0.01409912109375, + "ewc_loss_parallel": 1.4126300811767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.044355392456055, + "learning_rate": 8.016108520559559e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8412038087844849, + "num_tokens": 72347312.0, + "step": 1892 + }, + { + "epoch": 0.24080905737183564, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.898366928100586, + "learning_rate": 8.020347604917338e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.854594886302948, + "num_tokens": 72382135.0, + "step": 1893 + }, + { + "epoch": 0.24093626765042617, + "ewc_loss": 0.01422119140625, + "ewc_loss_parallel": 1.424551010131836e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.779858589172363, + "learning_rate": 8.024586689275116e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8462188839912415, + "num_tokens": 72421526.0, + "step": 1894 + }, + { + "epoch": 0.24106347792901667, + "ewc_loss": 0.0142822265625, + "ewc_loss_parallel": 1.430511474609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.13174057006836, + "learning_rate": 8.028825773632894e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8473474383354187, + "num_tokens": 72458592.0, + "step": 1895 + }, + { + "epoch": 0.24119068820760717, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.987349510192871, + "learning_rate": 8.033064857990674e-07, + "loss": 0.4117, + "mean_token_accuracy": 0.8660913705825806, + "num_tokens": 72494734.0, + "step": 1896 + }, + { + "epoch": 0.2413178984861977, + "ewc_loss": 0.01416015625, + "ewc_loss_parallel": 1.4185905456542969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.943689346313477, + "learning_rate": 8.037303942348452e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8517770171165466, + "num_tokens": 72531560.0, + "step": 1897 + }, + { + "epoch": 0.2414451087647882, + "ewc_loss": 0.01422119140625, + "ewc_loss_parallel": 1.424551010131836e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.959389686584473, + "learning_rate": 8.041543026706231e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.852912187576294, + "num_tokens": 72570144.0, + "step": 1898 + }, + { + "epoch": 0.2415723190433787, + "ewc_loss": 0.01422119140625, + "ewc_loss_parallel": 1.424551010131836e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.120746612548828, + "learning_rate": 8.04578211106401e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8463860750198364, + "num_tokens": 72606738.0, + "step": 1899 + }, + { + "epoch": 0.24169952932196923, + "ewc_loss": 0.0142822265625, + "ewc_loss_parallel": 1.430511474609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.09333038330078, + "learning_rate": 8.050021195421789e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8359842300415039, + "num_tokens": 72646230.0, + "step": 1900 + }, + { + "epoch": 0.24182673960055973, + "ewc_loss": 0.0142822265625, + "ewc_loss_parallel": 1.430511474609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.00885581970215, + "learning_rate": 8.054260279779567e-07, + "loss": 0.4823, + "mean_token_accuracy": 0.8430066108703613, + "num_tokens": 72681811.0, + "step": 1901 + }, + { + "epoch": 0.24195394987915023, + "ewc_loss": 0.01434326171875, + "ewc_loss_parallel": 1.436471939086914e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.20515251159668, + "learning_rate": 8.058499364137346e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8338181376457214, + "num_tokens": 72721805.0, + "step": 1902 + }, + { + "epoch": 0.24208116015774075, + "ewc_loss": 0.01446533203125, + "ewc_loss_parallel": 1.4483928680419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.148435592651367, + "learning_rate": 8.062738448495124e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.8599361181259155, + "num_tokens": 72763142.0, + "step": 1903 + }, + { + "epoch": 0.24220837043633126, + "ewc_loss": 0.01434326171875, + "ewc_loss_parallel": 1.436471939086914e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.1020565032959, + "learning_rate": 8.066977532852904e-07, + "loss": 0.4234, + "mean_token_accuracy": 0.8669000267982483, + "num_tokens": 72802047.0, + "step": 1904 + }, + { + "epoch": 0.24233558071492176, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.270212173461914, + "learning_rate": 8.071216617210682e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8391115069389343, + "num_tokens": 72834470.0, + "step": 1905 + }, + { + "epoch": 0.24246279099351228, + "ewc_loss": 0.0142822265625, + "ewc_loss_parallel": 1.430511474609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.898370742797852, + "learning_rate": 8.075455701568461e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.8500076532363892, + "num_tokens": 72873101.0, + "step": 1906 + }, + { + "epoch": 0.24259000127210278, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.36661148071289, + "learning_rate": 8.07969478592624e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8588871359825134, + "num_tokens": 72915504.0, + "step": 1907 + }, + { + "epoch": 0.24271721155069328, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.265409469604492, + "learning_rate": 8.083933870284019e-07, + "loss": 0.4241, + "mean_token_accuracy": 0.8643263578414917, + "num_tokens": 72947179.0, + "step": 1908 + }, + { + "epoch": 0.2428444218292838, + "ewc_loss": 0.01422119140625, + "ewc_loss_parallel": 1.424551010131836e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.069292068481445, + "learning_rate": 8.088172954641796e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8508919477462769, + "num_tokens": 72983313.0, + "step": 1909 + }, + { + "epoch": 0.2429716321078743, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.207353591918945, + "learning_rate": 8.092412038999576e-07, + "loss": 0.5163, + "mean_token_accuracy": 0.835444450378418, + "num_tokens": 73014218.0, + "step": 1910 + }, + { + "epoch": 0.24309884238646481, + "ewc_loss": 0.01446533203125, + "ewc_loss_parallel": 1.4483928680419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.987160682678223, + "learning_rate": 8.096651123357354e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8304306864738464, + "num_tokens": 73048282.0, + "step": 1911 + }, + { + "epoch": 0.24322605266505534, + "ewc_loss": 0.01446533203125, + "ewc_loss_parallel": 1.4483928680419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.3576717376709, + "learning_rate": 8.100890207715134e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8376772403717041, + "num_tokens": 73084390.0, + "step": 1912 + }, + { + "epoch": 0.24335326294364584, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.99792194366455, + "learning_rate": 8.105129292072912e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8466206789016724, + "num_tokens": 73123296.0, + "step": 1913 + }, + { + "epoch": 0.24348047322223634, + "ewc_loss": 0.014404296875, + "ewc_loss_parallel": 1.4424324035644531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.08733367919922, + "learning_rate": 8.10936837643069e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8343006372451782, + "num_tokens": 73157533.0, + "step": 1914 + }, + { + "epoch": 0.24360768350082687, + "ewc_loss": 0.014404296875, + "ewc_loss_parallel": 1.4424324035644531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.137712478637695, + "learning_rate": 8.11360746078847e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8525373935699463, + "num_tokens": 73195314.0, + "step": 1915 + }, + { + "epoch": 0.24373489377941737, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.00232696533203, + "learning_rate": 8.117846545146248e-07, + "loss": 0.4254, + "mean_token_accuracy": 0.8615268468856812, + "num_tokens": 73236763.0, + "step": 1916 + }, + { + "epoch": 0.2438621040580079, + "ewc_loss": 0.01458740234375, + "ewc_loss_parallel": 1.4603137969970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.111183166503906, + "learning_rate": 8.122085629504026e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8335838913917542, + "num_tokens": 73276719.0, + "step": 1917 + }, + { + "epoch": 0.2439893143365984, + "ewc_loss": 0.01446533203125, + "ewc_loss_parallel": 1.4483928680419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.03960609436035, + "learning_rate": 8.126324713861805e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8489706516265869, + "num_tokens": 73318594.0, + "step": 1918 + }, + { + "epoch": 0.2441165246151889, + "ewc_loss": 0.0146484375, + "ewc_loss_parallel": 1.4662742614746094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.151363372802734, + "learning_rate": 8.130563798219584e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8561756014823914, + "num_tokens": 73358760.0, + "step": 1919 + }, + { + "epoch": 0.24424373489377943, + "ewc_loss": 0.0146484375, + "ewc_loss_parallel": 1.4662742614746094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.014583587646484, + "learning_rate": 8.134802882577363e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8450211882591248, + "num_tokens": 73398623.0, + "step": 1920 + }, + { + "epoch": 0.24437094517236993, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.32200813293457, + "learning_rate": 8.139041966935142e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8478074669837952, + "num_tokens": 73446369.0, + "step": 1921 + }, + { + "epoch": 0.24449815545096043, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 15.879984855651855, + "learning_rate": 8.14328105129292e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8325735330581665, + "num_tokens": 73483152.0, + "step": 1922 + }, + { + "epoch": 0.24462536572955096, + "ewc_loss": 0.014404296875, + "ewc_loss_parallel": 1.4424324035644531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.246355056762695, + "learning_rate": 8.1475201356507e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8430588245391846, + "num_tokens": 73519179.0, + "step": 1923 + }, + { + "epoch": 0.24475257600814146, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.15650749206543, + "learning_rate": 8.151759220008477e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8486585021018982, + "num_tokens": 73561947.0, + "step": 1924 + }, + { + "epoch": 0.24487978628673196, + "ewc_loss": 0.01446533203125, + "ewc_loss_parallel": 1.4483928680419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.317974090576172, + "learning_rate": 8.155998304366256e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8506535887718201, + "num_tokens": 73595841.0, + "step": 1925 + }, + { + "epoch": 0.2450069965653225, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.202207565307617, + "learning_rate": 8.160237388724035e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8349061012268066, + "num_tokens": 73629716.0, + "step": 1926 + }, + { + "epoch": 0.245134206843913, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.070085525512695, + "learning_rate": 8.164476473081814e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8425544500350952, + "num_tokens": 73667285.0, + "step": 1927 + }, + { + "epoch": 0.2452614171225035, + "ewc_loss": 0.01458740234375, + "ewc_loss_parallel": 1.4603137969970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.144054412841797, + "learning_rate": 8.168715557439593e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.859912097454071, + "num_tokens": 73708059.0, + "step": 1928 + }, + { + "epoch": 0.24538862740109402, + "ewc_loss": 0.0146484375, + "ewc_loss_parallel": 1.4662742614746094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.131624221801758, + "learning_rate": 8.172954641797372e-07, + "loss": 0.4505, + "mean_token_accuracy": 0.853791356086731, + "num_tokens": 73748553.0, + "step": 1929 + }, + { + "epoch": 0.24551583767968452, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.277387619018555, + "learning_rate": 8.17719372615515e-07, + "loss": 0.5516, + "mean_token_accuracy": 0.8258775472640991, + "num_tokens": 73796295.0, + "step": 1930 + }, + { + "epoch": 0.24564304795827502, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.138154983520508, + "learning_rate": 8.18143281051293e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.8329344987869263, + "num_tokens": 73835960.0, + "step": 1931 + }, + { + "epoch": 0.24577025823686555, + "ewc_loss": 0.01458740234375, + "ewc_loss_parallel": 1.4603137969970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.17721176147461, + "learning_rate": 8.185671894870707e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8276559710502625, + "num_tokens": 73875302.0, + "step": 1932 + }, + { + "epoch": 0.24589746851545605, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.083209991455078, + "learning_rate": 8.189910979228485e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8579937219619751, + "num_tokens": 73907703.0, + "step": 1933 + }, + { + "epoch": 0.24602467879404655, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.228839874267578, + "learning_rate": 8.194150063586265e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8329579830169678, + "num_tokens": 73945449.0, + "step": 1934 + }, + { + "epoch": 0.24615188907263708, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.528194427490234, + "learning_rate": 8.198389147944043e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8315050601959229, + "num_tokens": 73982461.0, + "step": 1935 + }, + { + "epoch": 0.24627909935122758, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.181560516357422, + "learning_rate": 8.202628232301823e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8460862636566162, + "num_tokens": 74018252.0, + "step": 1936 + }, + { + "epoch": 0.24640630962981808, + "ewc_loss": 0.01458740234375, + "ewc_loss_parallel": 1.4603137969970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.376907348632812, + "learning_rate": 8.206867316659601e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8345792293548584, + "num_tokens": 74065843.0, + "step": 1937 + }, + { + "epoch": 0.2465335199084086, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.246356964111328, + "learning_rate": 8.21110640101738e-07, + "loss": 0.4045, + "mean_token_accuracy": 0.8665695190429688, + "num_tokens": 74106389.0, + "step": 1938 + }, + { + "epoch": 0.2466607301869991, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.183086395263672, + "learning_rate": 8.215345485375159e-07, + "loss": 0.5249, + "mean_token_accuracy": 0.8341671824455261, + "num_tokens": 74150786.0, + "step": 1939 + }, + { + "epoch": 0.2467879404655896, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.613452911376953, + "learning_rate": 8.219584569732937e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8392809629440308, + "num_tokens": 74190474.0, + "step": 1940 + }, + { + "epoch": 0.24691515074418013, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.068622589111328, + "learning_rate": 8.223823654090715e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8571344614028931, + "num_tokens": 74226437.0, + "step": 1941 + }, + { + "epoch": 0.24704236102277063, + "ewc_loss": 0.01458740234375, + "ewc_loss_parallel": 1.4603137969970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.161985397338867, + "learning_rate": 8.228062738448495e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8451393842697144, + "num_tokens": 74267292.0, + "step": 1942 + }, + { + "epoch": 0.24716957130136116, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.466264724731445, + "learning_rate": 8.232301822806273e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8409318327903748, + "num_tokens": 74306738.0, + "step": 1943 + }, + { + "epoch": 0.24729678157995166, + "ewc_loss": 0.0145263671875, + "ewc_loss_parallel": 1.4543533325195312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.074859619140625, + "learning_rate": 8.236540907164053e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8466722965240479, + "num_tokens": 74343151.0, + "step": 1944 + }, + { + "epoch": 0.24742399185854216, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.277788162231445, + "learning_rate": 8.240779991521831e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8494277596473694, + "num_tokens": 74382755.0, + "step": 1945 + }, + { + "epoch": 0.2475512021371327, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.454872131347656, + "learning_rate": 8.24501907587961e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8402118682861328, + "num_tokens": 74426129.0, + "step": 1946 + }, + { + "epoch": 0.2476784124157232, + "ewc_loss": 0.01470947265625, + "ewc_loss_parallel": 1.4722347259521484e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.17142677307129, + "learning_rate": 8.249258160237388e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8412748575210571, + "num_tokens": 74460904.0, + "step": 1947 + }, + { + "epoch": 0.2478056226943137, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.22056770324707, + "learning_rate": 8.253497244595167e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.8399139642715454, + "num_tokens": 74497993.0, + "step": 1948 + }, + { + "epoch": 0.24793283297290422, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.200288772583008, + "learning_rate": 8.257736328952945e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8350913524627686, + "num_tokens": 74541108.0, + "step": 1949 + }, + { + "epoch": 0.24806004325149472, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.33965301513672, + "learning_rate": 8.261975413310725e-07, + "loss": 0.49, + "mean_token_accuracy": 0.8447152376174927, + "num_tokens": 74579389.0, + "step": 1950 + }, + { + "epoch": 0.24818725353008522, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.323230743408203, + "learning_rate": 8.266214497668503e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8609152436256409, + "num_tokens": 74621546.0, + "step": 1951 + }, + { + "epoch": 0.24831446380867575, + "ewc_loss": 0.01483154296875, + "ewc_loss_parallel": 1.4841556549072266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.204185485839844, + "learning_rate": 8.270453582026283e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8325436115264893, + "num_tokens": 74660178.0, + "step": 1952 + }, + { + "epoch": 0.24844167408726625, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.315141677856445, + "learning_rate": 8.274692666384061e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8495647311210632, + "num_tokens": 74693867.0, + "step": 1953 + }, + { + "epoch": 0.24856888436585675, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.340490341186523, + "learning_rate": 8.27893175074184e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8434159755706787, + "num_tokens": 74727561.0, + "step": 1954 + }, + { + "epoch": 0.24869609464444728, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.40137481689453, + "learning_rate": 8.283170835099618e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8468562364578247, + "num_tokens": 74763719.0, + "step": 1955 + }, + { + "epoch": 0.24882330492303778, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.400390625, + "learning_rate": 8.287409919457396e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8545525074005127, + "num_tokens": 74802666.0, + "step": 1956 + }, + { + "epoch": 0.24895051520162828, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.234928131103516, + "learning_rate": 8.291649003815175e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.8411229848861694, + "num_tokens": 74840307.0, + "step": 1957 + }, + { + "epoch": 0.2490777254802188, + "ewc_loss": 0.01483154296875, + "ewc_loss_parallel": 1.4841556549072266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.515405654907227, + "learning_rate": 8.295888088172954e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8400311470031738, + "num_tokens": 74880181.0, + "step": 1958 + }, + { + "epoch": 0.2492049357588093, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.14311981201172, + "learning_rate": 8.300127172530733e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8622341156005859, + "num_tokens": 74917148.0, + "step": 1959 + }, + { + "epoch": 0.2493321460373998, + "ewc_loss": 0.01483154296875, + "ewc_loss_parallel": 1.4841556549072266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.65288543701172, + "learning_rate": 8.304366256888512e-07, + "loss": 0.4564, + "mean_token_accuracy": 0.8516495227813721, + "num_tokens": 74953124.0, + "step": 1960 + }, + { + "epoch": 0.24945935631599034, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.41834831237793, + "learning_rate": 8.308605341246291e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.8482500314712524, + "num_tokens": 74989726.0, + "step": 1961 + }, + { + "epoch": 0.24958656659458084, + "ewc_loss": 0.0147705078125, + "ewc_loss_parallel": 1.4781951904296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.3189697265625, + "learning_rate": 8.312844425604068e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8552803993225098, + "num_tokens": 75024849.0, + "step": 1962 + }, + { + "epoch": 0.24971377687317134, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.42240333557129, + "learning_rate": 8.317083509961848e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8431419730186462, + "num_tokens": 75066464.0, + "step": 1963 + }, + { + "epoch": 0.24984098715176187, + "ewc_loss": 0.01483154296875, + "ewc_loss_parallel": 1.4841556549072266e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.222118377685547, + "learning_rate": 8.321322594319626e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.8540411591529846, + "num_tokens": 75106829.0, + "step": 1964 + }, + { + "epoch": 0.24996819743035237, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.310331344604492, + "learning_rate": 8.325561678677405e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8398633599281311, + "num_tokens": 75148227.0, + "step": 1965 + }, + { + "epoch": 0.2500954077089429, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.17259979248047, + "learning_rate": 8.329800763035184e-07, + "loss": 0.518, + "mean_token_accuracy": 0.8338892459869385, + "num_tokens": 75188719.0, + "step": 1966 + }, + { + "epoch": 0.25022261798753337, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.49191665649414, + "learning_rate": 8.334039847392963e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.8473942279815674, + "num_tokens": 75222260.0, + "step": 1967 + }, + { + "epoch": 0.2503498282661239, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.236019134521484, + "learning_rate": 8.338278931750742e-07, + "loss": 0.4427, + "mean_token_accuracy": 0.8574731349945068, + "num_tokens": 75255489.0, + "step": 1968 + }, + { + "epoch": 0.2504770385447144, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.335311889648438, + "learning_rate": 8.342518016108521e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8453167676925659, + "num_tokens": 75290548.0, + "step": 1969 + }, + { + "epoch": 0.2506042488233049, + "ewc_loss": 0.0150146484375, + "ewc_loss_parallel": 1.5020370483398438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.3532657623291, + "learning_rate": 8.346757100466298e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.850372314453125, + "num_tokens": 75323912.0, + "step": 1970 + }, + { + "epoch": 0.2507314591018954, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.248550415039062, + "learning_rate": 8.350996184824078e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8354228138923645, + "num_tokens": 75361206.0, + "step": 1971 + }, + { + "epoch": 0.25085866938048595, + "ewc_loss": 0.0150146484375, + "ewc_loss_parallel": 1.5020370483398438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.3332462310791, + "learning_rate": 8.355235269181856e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8418889045715332, + "num_tokens": 75397292.0, + "step": 1972 + }, + { + "epoch": 0.2509858796590764, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.53282928466797, + "learning_rate": 8.359474353539635e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.8567947745323181, + "num_tokens": 75437616.0, + "step": 1973 + }, + { + "epoch": 0.25111308993766696, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.196983337402344, + "learning_rate": 8.363713437897414e-07, + "loss": 0.542, + "mean_token_accuracy": 0.8294128179550171, + "num_tokens": 75486172.0, + "step": 1974 + }, + { + "epoch": 0.2512403002162575, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.38929557800293, + "learning_rate": 8.367952522255193e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8441166877746582, + "num_tokens": 75526637.0, + "step": 1975 + }, + { + "epoch": 0.25136751049484796, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.358884811401367, + "learning_rate": 8.372191606612972e-07, + "loss": 0.477, + "mean_token_accuracy": 0.8485085964202881, + "num_tokens": 75564632.0, + "step": 1976 + }, + { + "epoch": 0.2514947207734385, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.33184814453125, + "learning_rate": 8.376430690970749e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.8572269678115845, + "num_tokens": 75595553.0, + "step": 1977 + }, + { + "epoch": 0.251621931052029, + "ewc_loss": 0.0150146484375, + "ewc_loss_parallel": 1.5020370483398438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.508953094482422, + "learning_rate": 8.380669775328528e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8601265549659729, + "num_tokens": 75635847.0, + "step": 1978 + }, + { + "epoch": 0.25174914133061954, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.48158836364746, + "learning_rate": 8.384908859686307e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8611552715301514, + "num_tokens": 75677403.0, + "step": 1979 + }, + { + "epoch": 0.25187635160921, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.302610397338867, + "learning_rate": 8.389147944044086e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8454524278640747, + "num_tokens": 75720703.0, + "step": 1980 + }, + { + "epoch": 0.25200356188780054, + "ewc_loss": 0.0150146484375, + "ewc_loss_parallel": 1.5020370483398438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.588459014892578, + "learning_rate": 8.393387028401864e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8402667045593262, + "num_tokens": 75758000.0, + "step": 1981 + }, + { + "epoch": 0.25213077216639107, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.889535903930664, + "learning_rate": 8.397626112759644e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.832599401473999, + "num_tokens": 75791113.0, + "step": 1982 + }, + { + "epoch": 0.25225798244498154, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.473827362060547, + "learning_rate": 8.401865197117422e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.8481397032737732, + "num_tokens": 75821733.0, + "step": 1983 + }, + { + "epoch": 0.25238519272357207, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.200485229492188, + "learning_rate": 8.406104281475202e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8527203798294067, + "num_tokens": 75861404.0, + "step": 1984 + }, + { + "epoch": 0.2525124030021626, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.45029067993164, + "learning_rate": 8.410343365832979e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8527860641479492, + "num_tokens": 75903059.0, + "step": 1985 + }, + { + "epoch": 0.2526396132807531, + "ewc_loss": 0.014892578125, + "ewc_loss_parallel": 1.4901161193847656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.603904724121094, + "learning_rate": 8.414582450190758e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8508836030960083, + "num_tokens": 75947125.0, + "step": 1986 + }, + { + "epoch": 0.2527668235593436, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.73856544494629, + "learning_rate": 8.418821534548537e-07, + "loss": 0.5519, + "mean_token_accuracy": 0.8254508376121521, + "num_tokens": 75987519.0, + "step": 1987 + }, + { + "epoch": 0.25289403383793413, + "ewc_loss": 0.0150146484375, + "ewc_loss_parallel": 1.5020370483398438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.438642501831055, + "learning_rate": 8.423060618906316e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8472344875335693, + "num_tokens": 76027728.0, + "step": 1988 + }, + { + "epoch": 0.2530212441165246, + "ewc_loss": 0.0150146484375, + "ewc_loss_parallel": 1.5020370483398438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.361562728881836, + "learning_rate": 8.427299703264095e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.8502647876739502, + "num_tokens": 76070942.0, + "step": 1989 + }, + { + "epoch": 0.25314845439511513, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.819643020629883, + "learning_rate": 8.431538787621874e-07, + "loss": 0.5031, + "mean_token_accuracy": 0.8405884504318237, + "num_tokens": 76113330.0, + "step": 1990 + }, + { + "epoch": 0.25327566467370566, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.505098342895508, + "learning_rate": 8.435777871979652e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8547405004501343, + "num_tokens": 76153511.0, + "step": 1991 + }, + { + "epoch": 0.25340287495229613, + "ewc_loss": 0.01495361328125, + "ewc_loss_parallel": 1.4960765838623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.270748138427734, + "learning_rate": 8.440016956337432e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8477612733840942, + "num_tokens": 76191505.0, + "step": 1992 + }, + { + "epoch": 0.25353008523088666, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.87639808654785, + "learning_rate": 8.444256040695209e-07, + "loss": 0.4963, + "mean_token_accuracy": 0.8431577682495117, + "num_tokens": 76232479.0, + "step": 1993 + }, + { + "epoch": 0.2536572955094772, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.69689178466797, + "learning_rate": 8.448495125052988e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8446313142776489, + "num_tokens": 76270192.0, + "step": 1994 + }, + { + "epoch": 0.25378450578806766, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.29501724243164, + "learning_rate": 8.452734209410767e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8620313405990601, + "num_tokens": 76303981.0, + "step": 1995 + }, + { + "epoch": 0.2539117160666582, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.65469741821289, + "learning_rate": 8.456973293768545e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8524099588394165, + "num_tokens": 76338496.0, + "step": 1996 + }, + { + "epoch": 0.2540389263452487, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.56753921508789, + "learning_rate": 8.461212378126325e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.8382336497306824, + "num_tokens": 76377904.0, + "step": 1997 + }, + { + "epoch": 0.2541661366238392, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.797544479370117, + "learning_rate": 8.465451462484103e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.8331543207168579, + "num_tokens": 76413905.0, + "step": 1998 + }, + { + "epoch": 0.2542933469024297, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.272518157958984, + "learning_rate": 8.469690546841882e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8454369306564331, + "num_tokens": 76451591.0, + "step": 1999 + }, + { + "epoch": 0.25442055718102025, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.46867561340332, + "learning_rate": 8.47392963119966e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8518861532211304, + "num_tokens": 76486191.0, + "step": 2000 + }, + { + "epoch": 0.2545477674596107, + "ewc_loss": 0.0150146484375, + "ewc_loss_parallel": 1.5020370483398438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.182443618774414, + "learning_rate": 8.478168715557439e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8497424721717834, + "num_tokens": 76523317.0, + "step": 2001 + }, + { + "epoch": 0.25467497773820125, + "ewc_loss": 0.0152587890625, + "ewc_loss_parallel": 1.52587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.662443161010742, + "learning_rate": 8.482407799915217e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8445266485214233, + "num_tokens": 76562661.0, + "step": 2002 + }, + { + "epoch": 0.2548021880167918, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.47756576538086, + "learning_rate": 8.486646884272997e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8436242341995239, + "num_tokens": 76600914.0, + "step": 2003 + }, + { + "epoch": 0.25492939829538225, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.94194793701172, + "learning_rate": 8.490885968630775e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8391764163970947, + "num_tokens": 76636310.0, + "step": 2004 + }, + { + "epoch": 0.2550566085739728, + "ewc_loss": 0.01513671875, + "ewc_loss_parallel": 1.5139579772949219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.375072479248047, + "learning_rate": 8.495125052988555e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8483004570007324, + "num_tokens": 76680889.0, + "step": 2005 + }, + { + "epoch": 0.2551838188525633, + "ewc_loss": 0.01507568359375, + "ewc_loss_parallel": 1.5079975128173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.42582893371582, + "learning_rate": 8.499364137346333e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8516112565994263, + "num_tokens": 76722469.0, + "step": 2006 + }, + { + "epoch": 0.2553110291311538, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.717144012451172, + "learning_rate": 8.503603221704112e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8365160226821899, + "num_tokens": 76757668.0, + "step": 2007 + }, + { + "epoch": 0.2554382394097443, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.61484718322754, + "learning_rate": 8.50784230606189e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8486993312835693, + "num_tokens": 76795607.0, + "step": 2008 + }, + { + "epoch": 0.25556544968833483, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.372135162353516, + "learning_rate": 8.512081390419669e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8465191721916199, + "num_tokens": 76837161.0, + "step": 2009 + }, + { + "epoch": 0.2556926599669253, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.490121841430664, + "learning_rate": 8.516320474777447e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8462674021720886, + "num_tokens": 76872198.0, + "step": 2010 + }, + { + "epoch": 0.25581987024551583, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.656906127929688, + "learning_rate": 8.520559559135227e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8586271405220032, + "num_tokens": 76908976.0, + "step": 2011 + }, + { + "epoch": 0.25594708052410636, + "ewc_loss": 0.0152587890625, + "ewc_loss_parallel": 1.52587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.49842071533203, + "learning_rate": 8.524798643493005e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8453860282897949, + "num_tokens": 76947298.0, + "step": 2012 + }, + { + "epoch": 0.25607429080269684, + "ewc_loss": 0.0152587890625, + "ewc_loss_parallel": 1.52587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.56821632385254, + "learning_rate": 8.529037727850785e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8522633910179138, + "num_tokens": 76985112.0, + "step": 2013 + }, + { + "epoch": 0.25620150108128736, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.379514694213867, + "learning_rate": 8.533276812208563e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.8619780540466309, + "num_tokens": 77020309.0, + "step": 2014 + }, + { + "epoch": 0.2563287113598779, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.619096755981445, + "learning_rate": 8.53751589656634e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8473557233810425, + "num_tokens": 77063245.0, + "step": 2015 + }, + { + "epoch": 0.25645592163846836, + "ewc_loss": 0.015380859375, + "ewc_loss_parallel": 1.537799835205078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.53879165649414, + "learning_rate": 8.54175498092412e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8503516912460327, + "num_tokens": 77096791.0, + "step": 2016 + }, + { + "epoch": 0.2565831319170589, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.658493041992188, + "learning_rate": 8.545994065281898e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.8583998680114746, + "num_tokens": 77129281.0, + "step": 2017 + }, + { + "epoch": 0.2567103421956494, + "ewc_loss": 0.0152587890625, + "ewc_loss_parallel": 1.52587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.64406967163086, + "learning_rate": 8.550233149639677e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.829209566116333, + "num_tokens": 77168785.0, + "step": 2018 + }, + { + "epoch": 0.2568375524742399, + "ewc_loss": 0.01519775390625, + "ewc_loss_parallel": 1.519918441772461e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.484500885009766, + "learning_rate": 8.554472233997456e-07, + "loss": 0.5, + "mean_token_accuracy": 0.8371227979660034, + "num_tokens": 77208360.0, + "step": 2019 + }, + { + "epoch": 0.2569647627528304, + "ewc_loss": 0.015380859375, + "ewc_loss_parallel": 1.537799835205078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.564083099365234, + "learning_rate": 8.558711318355235e-07, + "loss": 0.5328, + "mean_token_accuracy": 0.8293783068656921, + "num_tokens": 77245539.0, + "step": 2020 + }, + { + "epoch": 0.25709197303142095, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.674636840820312, + "learning_rate": 8.562950402713014e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.8654814958572388, + "num_tokens": 77279118.0, + "step": 2021 + }, + { + "epoch": 0.2572191833100114, + "ewc_loss": 0.015380859375, + "ewc_loss_parallel": 1.537799835205078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.456560134887695, + "learning_rate": 8.567189487070793e-07, + "loss": 0.5232, + "mean_token_accuracy": 0.8362671136856079, + "num_tokens": 77316259.0, + "step": 2022 + }, + { + "epoch": 0.25734639358860195, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.73530387878418, + "learning_rate": 8.57142857142857e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8298063278198242, + "num_tokens": 77359712.0, + "step": 2023 + }, + { + "epoch": 0.2574736038671925, + "ewc_loss": 0.015380859375, + "ewc_loss_parallel": 1.537799835205078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.56362533569336, + "learning_rate": 8.57566765578635e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8605771064758301, + "num_tokens": 77402269.0, + "step": 2024 + }, + { + "epoch": 0.25760081414578295, + "ewc_loss": 0.0152587890625, + "ewc_loss_parallel": 1.52587890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.511825561523438, + "learning_rate": 8.579906740144128e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8500720858573914, + "num_tokens": 77445791.0, + "step": 2025 + }, + { + "epoch": 0.2577280244243735, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.638334274291992, + "learning_rate": 8.584145824501907e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8369925022125244, + "num_tokens": 77485376.0, + "step": 2026 + }, + { + "epoch": 0.257855234702964, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.659719467163086, + "learning_rate": 8.588384908859686e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8424631357192993, + "num_tokens": 77524344.0, + "step": 2027 + }, + { + "epoch": 0.25798244498155454, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.4816951751709, + "learning_rate": 8.592623993217465e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8444185256958008, + "num_tokens": 77562746.0, + "step": 2028 + }, + { + "epoch": 0.258109655260145, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.881425857543945, + "learning_rate": 8.596863077575244e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8377434015274048, + "num_tokens": 77601261.0, + "step": 2029 + }, + { + "epoch": 0.25823686553873554, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.52353286743164, + "learning_rate": 8.601102161933023e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8412812352180481, + "num_tokens": 77640054.0, + "step": 2030 + }, + { + "epoch": 0.25836407581732607, + "ewc_loss": 0.015380859375, + "ewc_loss_parallel": 1.537799835205078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.65882682800293, + "learning_rate": 8.6053412462908e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8491774797439575, + "num_tokens": 77680098.0, + "step": 2031 + }, + { + "epoch": 0.25849128609591654, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.679534912109375, + "learning_rate": 8.60958033064858e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8523605465888977, + "num_tokens": 77712517.0, + "step": 2032 + }, + { + "epoch": 0.25861849637450707, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.59523582458496, + "learning_rate": 8.613819415006358e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8372585773468018, + "num_tokens": 77751335.0, + "step": 2033 + }, + { + "epoch": 0.2587457066530976, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.868715286254883, + "learning_rate": 8.618058499364137e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8362785577774048, + "num_tokens": 77792300.0, + "step": 2034 + }, + { + "epoch": 0.25887291693168807, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.761066436767578, + "learning_rate": 8.622297583721916e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8520626425743103, + "num_tokens": 77836391.0, + "step": 2035 + }, + { + "epoch": 0.2590001272102786, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.580127716064453, + "learning_rate": 8.626536668079695e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8431588411331177, + "num_tokens": 77871600.0, + "step": 2036 + }, + { + "epoch": 0.2591273374888691, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.82476806640625, + "learning_rate": 8.630775752437474e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8493351936340332, + "num_tokens": 77904623.0, + "step": 2037 + }, + { + "epoch": 0.2592545477674596, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.496963500976562, + "learning_rate": 8.635014836795251e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.838222861289978, + "num_tokens": 77941145.0, + "step": 2038 + }, + { + "epoch": 0.2593817580460501, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.065793991088867, + "learning_rate": 8.63925392115303e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.8533099293708801, + "num_tokens": 77975087.0, + "step": 2039 + }, + { + "epoch": 0.25950896832464065, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.61357307434082, + "learning_rate": 8.643493005510809e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8524636030197144, + "num_tokens": 78009526.0, + "step": 2040 + }, + { + "epoch": 0.2596361786032311, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.761999130249023, + "learning_rate": 8.647732089868588e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.8304874300956726, + "num_tokens": 78050918.0, + "step": 2041 + }, + { + "epoch": 0.25976338888182166, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.7143611907959, + "learning_rate": 8.651971174226366e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8553327918052673, + "num_tokens": 78085547.0, + "step": 2042 + }, + { + "epoch": 0.2598905991604122, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.468217849731445, + "learning_rate": 8.656210258584146e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8330469727516174, + "num_tokens": 78122916.0, + "step": 2043 + }, + { + "epoch": 0.26001780943900266, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.840473175048828, + "learning_rate": 8.660449342941924e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.833366870880127, + "num_tokens": 78161800.0, + "step": 2044 + }, + { + "epoch": 0.2601450197175932, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.698745727539062, + "learning_rate": 8.664688427299704e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.8428030014038086, + "num_tokens": 78199961.0, + "step": 2045 + }, + { + "epoch": 0.2602722299961837, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.716686248779297, + "learning_rate": 8.668927511657481e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8539580702781677, + "num_tokens": 78231332.0, + "step": 2046 + }, + { + "epoch": 0.2603994402747742, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.71603012084961, + "learning_rate": 8.67316659601526e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8380352258682251, + "num_tokens": 78265054.0, + "step": 2047 + }, + { + "epoch": 0.2605266505533647, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.71367073059082, + "learning_rate": 8.677405680373039e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8539851903915405, + "num_tokens": 78305445.0, + "step": 2048 + }, + { + "epoch": 0.26065386083195524, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.62868881225586, + "learning_rate": 8.681644764730818e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8413673043251038, + "num_tokens": 78339174.0, + "step": 2049 + }, + { + "epoch": 0.2607810711105457, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.74702262878418, + "learning_rate": 8.685883849088596e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.860611081123352, + "num_tokens": 78377630.0, + "step": 2050 + }, + { + "epoch": 0.26090828138913624, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.8507022857666, + "learning_rate": 8.690122933446376e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8472294807434082, + "num_tokens": 78419908.0, + "step": 2051 + }, + { + "epoch": 0.26103549166772677, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.092376708984375, + "learning_rate": 8.694362017804154e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8550996780395508, + "num_tokens": 78459561.0, + "step": 2052 + }, + { + "epoch": 0.26116270194631724, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.799304962158203, + "learning_rate": 8.698601102161933e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8436682820320129, + "num_tokens": 78502500.0, + "step": 2053 + }, + { + "epoch": 0.26128991222490777, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.278034210205078, + "learning_rate": 8.702840186519711e-07, + "loss": 0.4459, + "mean_token_accuracy": 0.8575461506843567, + "num_tokens": 78545411.0, + "step": 2054 + }, + { + "epoch": 0.2614171225034983, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.755184173583984, + "learning_rate": 8.70707927087749e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.8576827049255371, + "num_tokens": 78585878.0, + "step": 2055 + }, + { + "epoch": 0.2615443327820888, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.07305908203125, + "learning_rate": 8.711318355235269e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.828410804271698, + "num_tokens": 78621233.0, + "step": 2056 + }, + { + "epoch": 0.2616715430606793, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.05051612854004, + "learning_rate": 8.715557439593047e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8515546917915344, + "num_tokens": 78654644.0, + "step": 2057 + }, + { + "epoch": 0.26179875333926983, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.151063919067383, + "learning_rate": 8.719796523950826e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8298545479774475, + "num_tokens": 78692851.0, + "step": 2058 + }, + { + "epoch": 0.2619259636178603, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.982196807861328, + "learning_rate": 8.724035608308605e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8634630441665649, + "num_tokens": 78725329.0, + "step": 2059 + }, + { + "epoch": 0.26205317389645083, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.97284698486328, + "learning_rate": 8.728274692666384e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8465631604194641, + "num_tokens": 78766645.0, + "step": 2060 + }, + { + "epoch": 0.26218038417504136, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.20551872253418, + "learning_rate": 8.732513777024162e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8496098518371582, + "num_tokens": 78801576.0, + "step": 2061 + }, + { + "epoch": 0.26230759445363183, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.71599578857422, + "learning_rate": 8.736752861381941e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.8333470225334167, + "num_tokens": 78845577.0, + "step": 2062 + }, + { + "epoch": 0.26243480473222236, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.938425064086914, + "learning_rate": 8.740991945739719e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8455096483230591, + "num_tokens": 78884229.0, + "step": 2063 + }, + { + "epoch": 0.2625620150108129, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.172649383544922, + "learning_rate": 8.745231030097499e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.8499330282211304, + "num_tokens": 78926948.0, + "step": 2064 + }, + { + "epoch": 0.26268922528940336, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.75023651123047, + "learning_rate": 8.749470114455277e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8558911681175232, + "num_tokens": 78963582.0, + "step": 2065 + }, + { + "epoch": 0.2628164355679939, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.821624755859375, + "learning_rate": 8.753709198813056e-07, + "loss": 0.5567, + "mean_token_accuracy": 0.8235707879066467, + "num_tokens": 79007345.0, + "step": 2066 + }, + { + "epoch": 0.2629436458465844, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.843257904052734, + "learning_rate": 8.757948283170835e-07, + "loss": 0.4539, + "mean_token_accuracy": 0.8575340509414673, + "num_tokens": 79049505.0, + "step": 2067 + }, + { + "epoch": 0.2630708561251749, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.43691062927246, + "learning_rate": 8.762187367528613e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8538172245025635, + "num_tokens": 79087419.0, + "step": 2068 + }, + { + "epoch": 0.2631980664037654, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.703662872314453, + "learning_rate": 8.766426451886392e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8401191234588623, + "num_tokens": 79131444.0, + "step": 2069 + }, + { + "epoch": 0.26332527668235595, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.433242797851562, + "learning_rate": 8.770665536244171e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8371050357818604, + "num_tokens": 79171976.0, + "step": 2070 + }, + { + "epoch": 0.2634524869609464, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.88192367553711, + "learning_rate": 8.774904620601949e-07, + "loss": 0.451, + "mean_token_accuracy": 0.8541407585144043, + "num_tokens": 79206273.0, + "step": 2071 + }, + { + "epoch": 0.26357969723953695, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.960214614868164, + "learning_rate": 8.779143704959729e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8546192049980164, + "num_tokens": 79245564.0, + "step": 2072 + }, + { + "epoch": 0.2637069075181275, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.94658660888672, + "learning_rate": 8.783382789317507e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8466944098472595, + "num_tokens": 79281857.0, + "step": 2073 + }, + { + "epoch": 0.26383411779671795, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.89457130432129, + "learning_rate": 8.787621873675286e-07, + "loss": 0.517, + "mean_token_accuracy": 0.8321760892868042, + "num_tokens": 79312482.0, + "step": 2074 + }, + { + "epoch": 0.2639613280753085, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.78931999206543, + "learning_rate": 8.791860958033065e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8550777435302734, + "num_tokens": 79348157.0, + "step": 2075 + }, + { + "epoch": 0.264088538353899, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.8769588470459, + "learning_rate": 8.796100042390842e-07, + "loss": 0.4423, + "mean_token_accuracy": 0.8590100407600403, + "num_tokens": 79387540.0, + "step": 2076 + }, + { + "epoch": 0.2642157486324895, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.007192611694336, + "learning_rate": 8.800339126748622e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.8512341976165771, + "num_tokens": 79421424.0, + "step": 2077 + }, + { + "epoch": 0.26434295891108, + "ewc_loss": 0.015625, + "ewc_loss_parallel": 1.5616416931152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.005529403686523, + "learning_rate": 8.8045782111064e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.8309507369995117, + "num_tokens": 79458733.0, + "step": 2078 + }, + { + "epoch": 0.26447016918967053, + "ewc_loss": 0.0155029296875, + "ewc_loss_parallel": 1.5497207641601562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.69200897216797, + "learning_rate": 8.808817295464179e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8475135564804077, + "num_tokens": 79495394.0, + "step": 2079 + }, + { + "epoch": 0.26459737946826106, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.010751724243164, + "learning_rate": 8.813056379821958e-07, + "loss": 0.4537, + "mean_token_accuracy": 0.8564443588256836, + "num_tokens": 79529799.0, + "step": 2080 + }, + { + "epoch": 0.26472458974685154, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.122350692749023, + "learning_rate": 8.817295464179737e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8460768461227417, + "num_tokens": 79564664.0, + "step": 2081 + }, + { + "epoch": 0.26485180002544206, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.79910659790039, + "learning_rate": 8.821534548537515e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8210839033126831, + "num_tokens": 79606104.0, + "step": 2082 + }, + { + "epoch": 0.2649790103040326, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.90894317626953, + "learning_rate": 8.825773632895295e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.856300950050354, + "num_tokens": 79647688.0, + "step": 2083 + }, + { + "epoch": 0.26510622058262306, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.924118041992188, + "learning_rate": 8.830012717253072e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8520140647888184, + "num_tokens": 79685500.0, + "step": 2084 + }, + { + "epoch": 0.2652334308612136, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.971233367919922, + "learning_rate": 8.834251801610852e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8403083086013794, + "num_tokens": 79729912.0, + "step": 2085 + }, + { + "epoch": 0.2653606411398041, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.04555892944336, + "learning_rate": 8.83849088596863e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8544774651527405, + "num_tokens": 79767487.0, + "step": 2086 + }, + { + "epoch": 0.2654878514183946, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.04593849182129, + "learning_rate": 8.842729970326409e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8638788461685181, + "num_tokens": 79804802.0, + "step": 2087 + }, + { + "epoch": 0.2656150616969851, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.054359436035156, + "learning_rate": 8.846969054684188e-07, + "loss": 0.5527, + "mean_token_accuracy": 0.8278090953826904, + "num_tokens": 79845685.0, + "step": 2088 + }, + { + "epoch": 0.26574227197557565, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.934812545776367, + "learning_rate": 8.851208139041967e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8572190999984741, + "num_tokens": 79884692.0, + "step": 2089 + }, + { + "epoch": 0.2658694822541661, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.045215606689453, + "learning_rate": 8.855447223399745e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8372780680656433, + "num_tokens": 79919780.0, + "step": 2090 + }, + { + "epoch": 0.26599669253275665, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.950366973876953, + "learning_rate": 8.859686307757524e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8320314288139343, + "num_tokens": 79958317.0, + "step": 2091 + }, + { + "epoch": 0.2661239028113472, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.861713409423828, + "learning_rate": 8.863925392115302e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8520068526268005, + "num_tokens": 79994126.0, + "step": 2092 + }, + { + "epoch": 0.26625111308993765, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.16277313232422, + "learning_rate": 8.868164476473082e-07, + "loss": 0.4302, + "mean_token_accuracy": 0.8633633852005005, + "num_tokens": 80033157.0, + "step": 2093 + }, + { + "epoch": 0.2663783233685282, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.021854400634766, + "learning_rate": 8.87240356083086e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8552228808403015, + "num_tokens": 80073354.0, + "step": 2094 + }, + { + "epoch": 0.2665055336471187, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.90361976623535, + "learning_rate": 8.876642645188639e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8348504900932312, + "num_tokens": 80111819.0, + "step": 2095 + }, + { + "epoch": 0.2666327439257092, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.335697174072266, + "learning_rate": 8.880881729546418e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.8597913384437561, + "num_tokens": 80147951.0, + "step": 2096 + }, + { + "epoch": 0.2667599542042997, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.032142639160156, + "learning_rate": 8.885120813904197e-07, + "loss": 0.448, + "mean_token_accuracy": 0.8553754091262817, + "num_tokens": 80191319.0, + "step": 2097 + }, + { + "epoch": 0.26688716448289024, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.044879913330078, + "learning_rate": 8.889359898261976e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.8331473469734192, + "num_tokens": 80226300.0, + "step": 2098 + }, + { + "epoch": 0.2670143747614807, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.19695281982422, + "learning_rate": 8.893598982619753e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8525813817977905, + "num_tokens": 80265465.0, + "step": 2099 + }, + { + "epoch": 0.26714158504007124, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.903066635131836, + "learning_rate": 8.897838066977532e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.852627158164978, + "num_tokens": 80312696.0, + "step": 2100 + }, + { + "epoch": 0.26726879531866177, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.155527114868164, + "learning_rate": 8.902077151335311e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8440378904342651, + "num_tokens": 80353507.0, + "step": 2101 + }, + { + "epoch": 0.26739600559725224, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.012420654296875, + "learning_rate": 8.90631623569309e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.8173599243164062, + "num_tokens": 80390795.0, + "step": 2102 + }, + { + "epoch": 0.26752321587584277, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.16326141357422, + "learning_rate": 8.910555320050868e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8483799695968628, + "num_tokens": 80427599.0, + "step": 2103 + }, + { + "epoch": 0.2676504261544333, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.780179977416992, + "learning_rate": 8.914794404408648e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8484863042831421, + "num_tokens": 80468084.0, + "step": 2104 + }, + { + "epoch": 0.26777763643302377, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.15060043334961, + "learning_rate": 8.919033488766426e-07, + "loss": 0.4229, + "mean_token_accuracy": 0.8647266626358032, + "num_tokens": 80507218.0, + "step": 2105 + }, + { + "epoch": 0.2679048467116143, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.220821380615234, + "learning_rate": 8.923272573124204e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8405295610427856, + "num_tokens": 80541730.0, + "step": 2106 + }, + { + "epoch": 0.2680320569902048, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.973196029663086, + "learning_rate": 8.927511657481983e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.835089385509491, + "num_tokens": 80584677.0, + "step": 2107 + }, + { + "epoch": 0.2681592672687953, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.0325984954834, + "learning_rate": 8.931750741839762e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8567606210708618, + "num_tokens": 80616332.0, + "step": 2108 + }, + { + "epoch": 0.2682864775473858, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.138450622558594, + "learning_rate": 8.935989826197541e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8540708422660828, + "num_tokens": 80654435.0, + "step": 2109 + }, + { + "epoch": 0.26841368782597635, + "ewc_loss": 0.0157470703125, + "ewc_loss_parallel": 1.5735626220703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.100984573364258, + "learning_rate": 8.94022891055532e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8640156984329224, + "num_tokens": 80691246.0, + "step": 2110 + }, + { + "epoch": 0.2685408981045668, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.190185546875, + "learning_rate": 8.944467994913098e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.8404057025909424, + "num_tokens": 80730510.0, + "step": 2111 + }, + { + "epoch": 0.26866810838315736, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.07202911376953, + "learning_rate": 8.948707079270878e-07, + "loss": 0.4066, + "mean_token_accuracy": 0.868396520614624, + "num_tokens": 80769369.0, + "step": 2112 + }, + { + "epoch": 0.2687953186617479, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.010356903076172, + "learning_rate": 8.952946163628656e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8502946496009827, + "num_tokens": 80802168.0, + "step": 2113 + }, + { + "epoch": 0.26892252894033836, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.159690856933594, + "learning_rate": 8.957185247986434e-07, + "loss": 0.4113, + "mean_token_accuracy": 0.8682106137275696, + "num_tokens": 80839389.0, + "step": 2114 + }, + { + "epoch": 0.2690497392189289, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.33889389038086, + "learning_rate": 8.961424332344213e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.85276859998703, + "num_tokens": 80881139.0, + "step": 2115 + }, + { + "epoch": 0.2691769494975194, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.077028274536133, + "learning_rate": 8.965663416701992e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8485875725746155, + "num_tokens": 80919809.0, + "step": 2116 + }, + { + "epoch": 0.2693041597761099, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.179712295532227, + "learning_rate": 8.969902501059771e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.8598123788833618, + "num_tokens": 80956860.0, + "step": 2117 + }, + { + "epoch": 0.2694313700547004, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.26450538635254, + "learning_rate": 8.97414158541755e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.867343008518219, + "num_tokens": 80994322.0, + "step": 2118 + }, + { + "epoch": 0.26955858033329094, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.98185157775879, + "learning_rate": 8.978380669775328e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8416095972061157, + "num_tokens": 81032792.0, + "step": 2119 + }, + { + "epoch": 0.2696857906118814, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.338115692138672, + "learning_rate": 8.982619754133107e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8429543375968933, + "num_tokens": 81070011.0, + "step": 2120 + }, + { + "epoch": 0.26981300089047194, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.125600814819336, + "learning_rate": 8.986858838490886e-07, + "loss": 0.498, + "mean_token_accuracy": 0.842580258846283, + "num_tokens": 81113145.0, + "step": 2121 + }, + { + "epoch": 0.26994021116906247, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.079044342041016, + "learning_rate": 8.991097922848663e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8519216775894165, + "num_tokens": 81154175.0, + "step": 2122 + }, + { + "epoch": 0.27006742144765294, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.18721580505371, + "learning_rate": 8.995337007206443e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8455658555030823, + "num_tokens": 81193319.0, + "step": 2123 + }, + { + "epoch": 0.2701946317262435, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.26805877685547, + "learning_rate": 8.999576091564221e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8533343076705933, + "num_tokens": 81229057.0, + "step": 2124 + }, + { + "epoch": 0.270321842004834, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.099191665649414, + "learning_rate": 9.003815175922001e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8507155179977417, + "num_tokens": 81262380.0, + "step": 2125 + }, + { + "epoch": 0.2704490522834245, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.119670867919922, + "learning_rate": 9.008054260279779e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8507822155952454, + "num_tokens": 81304080.0, + "step": 2126 + }, + { + "epoch": 0.270576262562015, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.15827178955078, + "learning_rate": 9.012293344637558e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8431931138038635, + "num_tokens": 81340226.0, + "step": 2127 + }, + { + "epoch": 0.27070347284060553, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.19314193725586, + "learning_rate": 9.016532428995337e-07, + "loss": 0.4472, + "mean_token_accuracy": 0.8560678958892822, + "num_tokens": 81378046.0, + "step": 2128 + }, + { + "epoch": 0.27083068311919606, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.165098190307617, + "learning_rate": 9.020771513353115e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8439125418663025, + "num_tokens": 81419486.0, + "step": 2129 + }, + { + "epoch": 0.27095789339778653, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.22647476196289, + "learning_rate": 9.025010597710894e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8429534435272217, + "num_tokens": 81457488.0, + "step": 2130 + }, + { + "epoch": 0.27108510367637706, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.413331985473633, + "learning_rate": 9.029249682068673e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8533809781074524, + "num_tokens": 81494812.0, + "step": 2131 + }, + { + "epoch": 0.2712123139549676, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.458251953125, + "learning_rate": 9.033488766426451e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8465293049812317, + "num_tokens": 81531421.0, + "step": 2132 + }, + { + "epoch": 0.27133952423355806, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.923023223876953, + "learning_rate": 9.037727850784231e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.856415867805481, + "num_tokens": 81567221.0, + "step": 2133 + }, + { + "epoch": 0.2714667345121486, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.215919494628906, + "learning_rate": 9.041966935142009e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8485864400863647, + "num_tokens": 81605832.0, + "step": 2134 + }, + { + "epoch": 0.2715939447907391, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.535669326782227, + "learning_rate": 9.046206019499788e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8642510771751404, + "num_tokens": 81646919.0, + "step": 2135 + }, + { + "epoch": 0.2717211550693296, + "ewc_loss": 0.015869140625, + "ewc_loss_parallel": 1.5854835510253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 16.924850463867188, + "learning_rate": 9.050445103857567e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8431447148323059, + "num_tokens": 81687214.0, + "step": 2136 + }, + { + "epoch": 0.2718483653479201, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.278663635253906, + "learning_rate": 9.054684188215344e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8678444623947144, + "num_tokens": 81721437.0, + "step": 2137 + }, + { + "epoch": 0.27197557562651065, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.114749908447266, + "learning_rate": 9.058923272573124e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8448159694671631, + "num_tokens": 81756595.0, + "step": 2138 + }, + { + "epoch": 0.2721027859051011, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.18610191345215, + "learning_rate": 9.063162356930902e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8564993143081665, + "num_tokens": 81796198.0, + "step": 2139 + }, + { + "epoch": 0.27222999618369165, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.116241455078125, + "learning_rate": 9.067401441288681e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8481016159057617, + "num_tokens": 81833002.0, + "step": 2140 + }, + { + "epoch": 0.2723572064622822, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.181196212768555, + "learning_rate": 9.07164052564646e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.850698709487915, + "num_tokens": 81872089.0, + "step": 2141 + }, + { + "epoch": 0.27248441674087265, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.283557891845703, + "learning_rate": 9.075879610004239e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8432379961013794, + "num_tokens": 81908356.0, + "step": 2142 + }, + { + "epoch": 0.2726116270194632, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.18136978149414, + "learning_rate": 9.080118694362017e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8402630686759949, + "num_tokens": 81950251.0, + "step": 2143 + }, + { + "epoch": 0.2727388372980537, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.238826751708984, + "learning_rate": 9.084357778719796e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8411521911621094, + "num_tokens": 81987157.0, + "step": 2144 + }, + { + "epoch": 0.2728660475766442, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.176929473876953, + "learning_rate": 9.088596863077574e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8530030846595764, + "num_tokens": 82023633.0, + "step": 2145 + }, + { + "epoch": 0.2729932578552347, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.21600341796875, + "learning_rate": 9.092835947435354e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8446229696273804, + "num_tokens": 82061311.0, + "step": 2146 + }, + { + "epoch": 0.27312046813382523, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.436113357543945, + "learning_rate": 9.097075031793132e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.8389359712600708, + "num_tokens": 82098370.0, + "step": 2147 + }, + { + "epoch": 0.2732476784124157, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.294723510742188, + "learning_rate": 9.101314116150911e-07, + "loss": 0.4375, + "mean_token_accuracy": 0.8639211058616638, + "num_tokens": 82134931.0, + "step": 2148 + }, + { + "epoch": 0.27337488869100623, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.284381866455078, + "learning_rate": 9.10555320050869e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8579601049423218, + "num_tokens": 82171391.0, + "step": 2149 + }, + { + "epoch": 0.27350209896959676, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.330175399780273, + "learning_rate": 9.109792284866469e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.854894757270813, + "num_tokens": 82214064.0, + "step": 2150 + }, + { + "epoch": 0.27362930924818724, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.243322372436523, + "learning_rate": 9.114031369224247e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.846619188785553, + "num_tokens": 82252774.0, + "step": 2151 + }, + { + "epoch": 0.27375651952677776, + "ewc_loss": 0.0159912109375, + "ewc_loss_parallel": 1.5974044799804688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.168548583984375, + "learning_rate": 9.118270453582026e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8340762853622437, + "num_tokens": 82291645.0, + "step": 2152 + }, + { + "epoch": 0.2738837298053683, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.209550857543945, + "learning_rate": 9.122509537939804e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.8456586599349976, + "num_tokens": 82333643.0, + "step": 2153 + }, + { + "epoch": 0.27401094008395877, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.10158348083496, + "learning_rate": 9.126748622297584e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.835564136505127, + "num_tokens": 82370792.0, + "step": 2154 + }, + { + "epoch": 0.2741381503625493, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.410810470581055, + "learning_rate": 9.130987706655362e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8411959409713745, + "num_tokens": 82408626.0, + "step": 2155 + }, + { + "epoch": 0.2742653606411398, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.101184844970703, + "learning_rate": 9.135226791013141e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8415631055831909, + "num_tokens": 82448401.0, + "step": 2156 + }, + { + "epoch": 0.2743925709197303, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.563337326049805, + "learning_rate": 9.13946587537092e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8457000255584717, + "num_tokens": 82476644.0, + "step": 2157 + }, + { + "epoch": 0.2745197811983208, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.161897659301758, + "learning_rate": 9.143704959728699e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8401024341583252, + "num_tokens": 82512722.0, + "step": 2158 + }, + { + "epoch": 0.27464699147691135, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.40839958190918, + "learning_rate": 9.147944044086476e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8381123542785645, + "num_tokens": 82545393.0, + "step": 2159 + }, + { + "epoch": 0.2747742017555018, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.207500457763672, + "learning_rate": 9.152183128444255e-07, + "loss": 0.5066, + "mean_token_accuracy": 0.844028651714325, + "num_tokens": 82586329.0, + "step": 2160 + }, + { + "epoch": 0.27490141203409235, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.297170639038086, + "learning_rate": 9.156422212802034e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8340127468109131, + "num_tokens": 82625563.0, + "step": 2161 + }, + { + "epoch": 0.2750286223126829, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.187917709350586, + "learning_rate": 9.160661297159813e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.8556728363037109, + "num_tokens": 82660030.0, + "step": 2162 + }, + { + "epoch": 0.27515583259127335, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.238269805908203, + "learning_rate": 9.164900381517592e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8483792543411255, + "num_tokens": 82698794.0, + "step": 2163 + }, + { + "epoch": 0.2752830428698639, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.10076141357422, + "learning_rate": 9.16913946587537e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8480222821235657, + "num_tokens": 82740015.0, + "step": 2164 + }, + { + "epoch": 0.2754102531484544, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.394092559814453, + "learning_rate": 9.17337855023315e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8444505929946899, + "num_tokens": 82775765.0, + "step": 2165 + }, + { + "epoch": 0.2755374634270449, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.14708137512207, + "learning_rate": 9.177617634590928e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8604170083999634, + "num_tokens": 82814324.0, + "step": 2166 + }, + { + "epoch": 0.2756646737056354, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.357460021972656, + "learning_rate": 9.181856718948706e-07, + "loss": 0.4223, + "mean_token_accuracy": 0.8631100654602051, + "num_tokens": 82851007.0, + "step": 2167 + }, + { + "epoch": 0.27579188398422594, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.372108459472656, + "learning_rate": 9.186095803306485e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8499196171760559, + "num_tokens": 82886846.0, + "step": 2168 + }, + { + "epoch": 0.2759190942628164, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.316991806030273, + "learning_rate": 9.190334887664264e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8401814103126526, + "num_tokens": 82929607.0, + "step": 2169 + }, + { + "epoch": 0.27604630454140694, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.47112274169922, + "learning_rate": 9.194573972022043e-07, + "loss": 0.5633, + "mean_token_accuracy": 0.8283705115318298, + "num_tokens": 82965089.0, + "step": 2170 + }, + { + "epoch": 0.27617351481999747, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.32832908630371, + "learning_rate": 9.198813056379822e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.829142153263092, + "num_tokens": 83008345.0, + "step": 2171 + }, + { + "epoch": 0.27630072509858794, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.371307373046875, + "learning_rate": 9.2030521407376e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8419389128684998, + "num_tokens": 83043763.0, + "step": 2172 + }, + { + "epoch": 0.27642793537717847, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.918481826782227, + "learning_rate": 9.20729122509538e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.8613678812980652, + "num_tokens": 83081389.0, + "step": 2173 + }, + { + "epoch": 0.276555145655769, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.4007511138916, + "learning_rate": 9.211530309453158e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8473789095878601, + "num_tokens": 83120547.0, + "step": 2174 + }, + { + "epoch": 0.27668235593435947, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.49276351928711, + "learning_rate": 9.215769393810936e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8491522073745728, + "num_tokens": 83157229.0, + "step": 2175 + }, + { + "epoch": 0.27680956621295, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.38220977783203, + "learning_rate": 9.220008478168715e-07, + "loss": 0.5596, + "mean_token_accuracy": 0.8223308324813843, + "num_tokens": 83198217.0, + "step": 2176 + }, + { + "epoch": 0.2769367764915405, + "ewc_loss": 0.01611328125, + "ewc_loss_parallel": 1.609325408935547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.617984771728516, + "learning_rate": 9.224247562526494e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8458821773529053, + "num_tokens": 83238750.0, + "step": 2177 + }, + { + "epoch": 0.277063986770131, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.844154357910156, + "learning_rate": 9.228486646884273e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8498421907424927, + "num_tokens": 83280138.0, + "step": 2178 + }, + { + "epoch": 0.2771911970487215, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.196626663208008, + "learning_rate": 9.232725731242052e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8337583541870117, + "num_tokens": 83320633.0, + "step": 2179 + }, + { + "epoch": 0.27731840732731206, + "ewc_loss": 0.0162353515625, + "ewc_loss_parallel": 1.621246337890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.458763122558594, + "learning_rate": 9.23696481559983e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8376169800758362, + "num_tokens": 83358787.0, + "step": 2180 + }, + { + "epoch": 0.2774456176059026, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.83786392211914, + "learning_rate": 9.24120389995761e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8566627502441406, + "num_tokens": 83400609.0, + "step": 2181 + }, + { + "epoch": 0.27757282788449306, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.615785598754883, + "learning_rate": 9.245442984315387e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8511881828308105, + "num_tokens": 83436058.0, + "step": 2182 + }, + { + "epoch": 0.2777000381630836, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.314422607421875, + "learning_rate": 9.249682068673165e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.838887095451355, + "num_tokens": 83474542.0, + "step": 2183 + }, + { + "epoch": 0.2778272484416741, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.347652435302734, + "learning_rate": 9.253921153030945e-07, + "loss": 0.5349, + "mean_token_accuracy": 0.8298279047012329, + "num_tokens": 83515551.0, + "step": 2184 + }, + { + "epoch": 0.2779544587202646, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.022565841674805, + "learning_rate": 9.258160237388723e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8584697246551514, + "num_tokens": 83551826.0, + "step": 2185 + }, + { + "epoch": 0.2780816689988551, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.22850799560547, + "learning_rate": 9.262399321746503e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.8498799800872803, + "num_tokens": 83586598.0, + "step": 2186 + }, + { + "epoch": 0.27820887927744564, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.437273025512695, + "learning_rate": 9.266638406104281e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8351545333862305, + "num_tokens": 83625996.0, + "step": 2187 + }, + { + "epoch": 0.2783360895560361, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.55986785888672, + "learning_rate": 9.27087749046206e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8460791707038879, + "num_tokens": 83657322.0, + "step": 2188 + }, + { + "epoch": 0.27846329983462664, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.56097412109375, + "learning_rate": 9.275116574819839e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8461290001869202, + "num_tokens": 83693578.0, + "step": 2189 + }, + { + "epoch": 0.27859051011321717, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.428682327270508, + "learning_rate": 9.279355659177617e-07, + "loss": 0.485, + "mean_token_accuracy": 0.8472268581390381, + "num_tokens": 83727672.0, + "step": 2190 + }, + { + "epoch": 0.27871772039180764, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.53011703491211, + "learning_rate": 9.283594743535395e-07, + "loss": 0.5289, + "mean_token_accuracy": 0.8345844745635986, + "num_tokens": 83761405.0, + "step": 2191 + }, + { + "epoch": 0.2788449306703982, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.70113182067871, + "learning_rate": 9.287833827893175e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8469088673591614, + "num_tokens": 83801564.0, + "step": 2192 + }, + { + "epoch": 0.2789721409489887, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.279462814331055, + "learning_rate": 9.292072912250953e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8402290344238281, + "num_tokens": 83834260.0, + "step": 2193 + }, + { + "epoch": 0.2790993512275792, + "ewc_loss": 0.016357421875, + "ewc_loss_parallel": 1.633167266845703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.319124221801758, + "learning_rate": 9.296311996608733e-07, + "loss": 0.4251, + "mean_token_accuracy": 0.8622506856918335, + "num_tokens": 83873518.0, + "step": 2194 + }, + { + "epoch": 0.2792265615061697, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.673908233642578, + "learning_rate": 9.300551080966511e-07, + "loss": 0.5399, + "mean_token_accuracy": 0.8284832835197449, + "num_tokens": 83913902.0, + "step": 2195 + }, + { + "epoch": 0.27935377178476023, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.350515365600586, + "learning_rate": 9.30479016532429e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8469796776771545, + "num_tokens": 83947952.0, + "step": 2196 + }, + { + "epoch": 0.2794809820633507, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.483510971069336, + "learning_rate": 9.309029249682068e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.848548412322998, + "num_tokens": 83986057.0, + "step": 2197 + }, + { + "epoch": 0.27960819234194123, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.427288055419922, + "learning_rate": 9.313268334039847e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8445684313774109, + "num_tokens": 84023881.0, + "step": 2198 + }, + { + "epoch": 0.27973540262053176, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.46202278137207, + "learning_rate": 9.317507418397625e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.8667633533477783, + "num_tokens": 84061480.0, + "step": 2199 + }, + { + "epoch": 0.27986261289912223, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.40802764892578, + "learning_rate": 9.321746502755404e-07, + "loss": 0.522, + "mean_token_accuracy": 0.836362361907959, + "num_tokens": 84101782.0, + "step": 2200 + }, + { + "epoch": 0.27998982317771276, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.606475830078125, + "learning_rate": 9.325985587113183e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8460999727249146, + "num_tokens": 84136125.0, + "step": 2201 + }, + { + "epoch": 0.2801170334563033, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.521535873413086, + "learning_rate": 9.330224671470962e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.8468997478485107, + "num_tokens": 84169748.0, + "step": 2202 + }, + { + "epoch": 0.28024424373489376, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.4962215423584, + "learning_rate": 9.334463755828741e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8346999883651733, + "num_tokens": 84201292.0, + "step": 2203 + }, + { + "epoch": 0.2803714540134843, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.546642303466797, + "learning_rate": 9.338702840186519e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.8356702923774719, + "num_tokens": 84240652.0, + "step": 2204 + }, + { + "epoch": 0.2804986642920748, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.39162826538086, + "learning_rate": 9.342941924544298e-07, + "loss": 0.5, + "mean_token_accuracy": 0.8416280150413513, + "num_tokens": 84280252.0, + "step": 2205 + }, + { + "epoch": 0.2806258745706653, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.434253692626953, + "learning_rate": 9.347181008902076e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8624545931816101, + "num_tokens": 84316505.0, + "step": 2206 + }, + { + "epoch": 0.2807530848492558, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.593997955322266, + "learning_rate": 9.351420093259855e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8576128482818604, + "num_tokens": 84349990.0, + "step": 2207 + }, + { + "epoch": 0.28088029512784635, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.4936580657959, + "learning_rate": 9.355659177617634e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8595693707466125, + "num_tokens": 84381066.0, + "step": 2208 + }, + { + "epoch": 0.2810075054064368, + "ewc_loss": 0.0164794921875, + "ewc_loss_parallel": 1.6450881958007812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.552841186523438, + "learning_rate": 9.359898261975413e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.8641200065612793, + "num_tokens": 84422540.0, + "step": 2209 + }, + { + "epoch": 0.28113471568502735, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.444244384765625, + "learning_rate": 9.364137346333192e-07, + "loss": 0.55, + "mean_token_accuracy": 0.8270378112792969, + "num_tokens": 84462158.0, + "step": 2210 + }, + { + "epoch": 0.2812619259636179, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.592458724975586, + "learning_rate": 9.368376430690971e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.857974648475647, + "num_tokens": 84495887.0, + "step": 2211 + }, + { + "epoch": 0.28138913624220835, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.49656867980957, + "learning_rate": 9.372615515048749e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.8381519317626953, + "num_tokens": 84530176.0, + "step": 2212 + }, + { + "epoch": 0.2815163465207989, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.589284896850586, + "learning_rate": 9.376854599406528e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8393266797065735, + "num_tokens": 84574450.0, + "step": 2213 + }, + { + "epoch": 0.2816435567993894, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.526554107666016, + "learning_rate": 9.381093683764306e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8468750715255737, + "num_tokens": 84610661.0, + "step": 2214 + }, + { + "epoch": 0.2817707670779799, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.489463806152344, + "learning_rate": 9.385332768122085e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.849702000617981, + "num_tokens": 84651259.0, + "step": 2215 + }, + { + "epoch": 0.2818979773565704, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.47727394104004, + "learning_rate": 9.389571852479864e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.851354718208313, + "num_tokens": 84692389.0, + "step": 2216 + }, + { + "epoch": 0.28202518763516093, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.500381469726562, + "learning_rate": 9.393810936837643e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.850077748298645, + "num_tokens": 84731905.0, + "step": 2217 + }, + { + "epoch": 0.2821523979137514, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.66480255126953, + "learning_rate": 9.398050021195422e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8446285724639893, + "num_tokens": 84773727.0, + "step": 2218 + }, + { + "epoch": 0.28227960819234194, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.50716781616211, + "learning_rate": 9.402289105553201e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8571961522102356, + "num_tokens": 84816771.0, + "step": 2219 + }, + { + "epoch": 0.28240681847093246, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.453548431396484, + "learning_rate": 9.406528189910978e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.851533830165863, + "num_tokens": 84853172.0, + "step": 2220 + }, + { + "epoch": 0.28253402874952294, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.633344650268555, + "learning_rate": 9.410767274268757e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.852831244468689, + "num_tokens": 84891508.0, + "step": 2221 + }, + { + "epoch": 0.28266123902811346, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.69211196899414, + "learning_rate": 9.415006358626536e-07, + "loss": 0.5473, + "mean_token_accuracy": 0.8253505229949951, + "num_tokens": 84931204.0, + "step": 2222 + }, + { + "epoch": 0.282788449306704, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.94822120666504, + "learning_rate": 9.419245442984314e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.832375168800354, + "num_tokens": 84966782.0, + "step": 2223 + }, + { + "epoch": 0.28291565958529447, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.69540786743164, + "learning_rate": 9.423484527342094e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8453001976013184, + "num_tokens": 85001512.0, + "step": 2224 + }, + { + "epoch": 0.283042869863885, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.660051345825195, + "learning_rate": 9.427723611699872e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8473771810531616, + "num_tokens": 85036082.0, + "step": 2225 + }, + { + "epoch": 0.2831700801424755, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.868330001831055, + "learning_rate": 9.431962696057652e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.8447633385658264, + "num_tokens": 85079433.0, + "step": 2226 + }, + { + "epoch": 0.283297290421066, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.665769577026367, + "learning_rate": 9.43620178041543e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.8610824346542358, + "num_tokens": 85116940.0, + "step": 2227 + }, + { + "epoch": 0.2834245006996565, + "ewc_loss": 0.0166015625, + "ewc_loss_parallel": 1.6570091247558594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.68961524963379, + "learning_rate": 9.440440864773208e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8443598747253418, + "num_tokens": 85153328.0, + "step": 2228 + }, + { + "epoch": 0.28355171097824705, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.68086814880371, + "learning_rate": 9.444679949130987e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8412810564041138, + "num_tokens": 85194059.0, + "step": 2229 + }, + { + "epoch": 0.2836789212568376, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.64029884338379, + "learning_rate": 9.448919033488766e-07, + "loss": 0.5163, + "mean_token_accuracy": 0.8389192223548889, + "num_tokens": 85230820.0, + "step": 2230 + }, + { + "epoch": 0.28380613153542805, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.54386329650879, + "learning_rate": 9.453158117846544e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8425614833831787, + "num_tokens": 85268864.0, + "step": 2231 + }, + { + "epoch": 0.2839333418140186, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.77560043334961, + "learning_rate": 9.457397202204324e-07, + "loss": 0.5246, + "mean_token_accuracy": 0.8344732522964478, + "num_tokens": 85311377.0, + "step": 2232 + }, + { + "epoch": 0.2840605520926091, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.219892501831055, + "learning_rate": 9.461636286562102e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.8625282645225525, + "num_tokens": 85347082.0, + "step": 2233 + }, + { + "epoch": 0.2841877623711996, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.523513793945312, + "learning_rate": 9.465875370919882e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8357503414154053, + "num_tokens": 85384696.0, + "step": 2234 + }, + { + "epoch": 0.2843149726497901, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.717805862426758, + "learning_rate": 9.470114455277659e-07, + "loss": 0.5127, + "mean_token_accuracy": 0.8357899188995361, + "num_tokens": 85420192.0, + "step": 2235 + }, + { + "epoch": 0.28444218292838064, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.660276412963867, + "learning_rate": 9.474353539635438e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8504072427749634, + "num_tokens": 85462281.0, + "step": 2236 + }, + { + "epoch": 0.2845693932069711, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.625835418701172, + "learning_rate": 9.478592623993217e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8494139313697815, + "num_tokens": 85495191.0, + "step": 2237 + }, + { + "epoch": 0.28469660348556164, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.107206344604492, + "learning_rate": 9.482831708350996e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8520893454551697, + "num_tokens": 85533803.0, + "step": 2238 + }, + { + "epoch": 0.28482381376415217, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.498865127563477, + "learning_rate": 9.487070792708775e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8393528461456299, + "num_tokens": 85573424.0, + "step": 2239 + }, + { + "epoch": 0.28495102404274264, + "ewc_loss": 0.0167236328125, + "ewc_loss_parallel": 1.6689300537109375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.647554397583008, + "learning_rate": 9.491309877066554e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8428480625152588, + "num_tokens": 85616423.0, + "step": 2240 + }, + { + "epoch": 0.28507823432133317, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.701597213745117, + "learning_rate": 9.495548961424332e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.8339008092880249, + "num_tokens": 85658410.0, + "step": 2241 + }, + { + "epoch": 0.2852054445999237, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.77522087097168, + "learning_rate": 9.499788045782111e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8468512296676636, + "num_tokens": 85695767.0, + "step": 2242 + }, + { + "epoch": 0.28533265487851417, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.41444969177246, + "learning_rate": 9.504027130139889e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8427377939224243, + "num_tokens": 85737771.0, + "step": 2243 + }, + { + "epoch": 0.2854598651571047, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.742551803588867, + "learning_rate": 9.508266214497667e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.8614599704742432, + "num_tokens": 85774789.0, + "step": 2244 + }, + { + "epoch": 0.2855870754356952, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.677858352661133, + "learning_rate": 9.512505298855447e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.8488513231277466, + "num_tokens": 85817833.0, + "step": 2245 + }, + { + "epoch": 0.2857142857142857, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.473188400268555, + "learning_rate": 9.516744383213225e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8624462485313416, + "num_tokens": 85854488.0, + "step": 2246 + }, + { + "epoch": 0.2858414959928762, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.56850814819336, + "learning_rate": 9.520983467571005e-07, + "loss": 0.5031, + "mean_token_accuracy": 0.840664803981781, + "num_tokens": 85893746.0, + "step": 2247 + }, + { + "epoch": 0.28596870627146675, + "ewc_loss": 0.016845703125, + "ewc_loss_parallel": 1.6808509826660156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.604427337646484, + "learning_rate": 9.525222551928783e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.8339802622795105, + "num_tokens": 85932798.0, + "step": 2248 + }, + { + "epoch": 0.2860959165500572, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.62835693359375, + "learning_rate": 9.529461636286562e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8537768125534058, + "num_tokens": 85964446.0, + "step": 2249 + }, + { + "epoch": 0.28622312682864776, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.765249252319336, + "learning_rate": 9.533700720644341e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8613641858100891, + "num_tokens": 86000587.0, + "step": 2250 + }, + { + "epoch": 0.2863503371072383, + "ewc_loss": 0.0172119140625, + "ewc_loss_parallel": 1.71661376953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.60612678527832, + "learning_rate": 9.537939805002118e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8448870182037354, + "num_tokens": 86042891.0, + "step": 2251 + }, + { + "epoch": 0.28647754738582876, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.744489669799805, + "learning_rate": 9.542178889359898e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8521593809127808, + "num_tokens": 86079973.0, + "step": 2252 + }, + { + "epoch": 0.2866047576644193, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.704662322998047, + "learning_rate": 9.546417973717677e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8461906909942627, + "num_tokens": 86119767.0, + "step": 2253 + }, + { + "epoch": 0.2867319679430098, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.017187118530273, + "learning_rate": 9.550657058075455e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8408430814743042, + "num_tokens": 86163176.0, + "step": 2254 + }, + { + "epoch": 0.2868591782216003, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.753889083862305, + "learning_rate": 9.554896142433234e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.8389160633087158, + "num_tokens": 86201523.0, + "step": 2255 + }, + { + "epoch": 0.2869863885001908, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.933748245239258, + "learning_rate": 9.559135226791012e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8305214643478394, + "num_tokens": 86240277.0, + "step": 2256 + }, + { + "epoch": 0.28711359877878134, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.609378814697266, + "learning_rate": 9.563374311148793e-07, + "loss": 0.5115, + "mean_token_accuracy": 0.8388653993606567, + "num_tokens": 86289714.0, + "step": 2257 + }, + { + "epoch": 0.2872408090573718, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.257038116455078, + "learning_rate": 9.56761339550657e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8544743657112122, + "num_tokens": 86320786.0, + "step": 2258 + }, + { + "epoch": 0.28736801933596234, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.68645477294922, + "learning_rate": 9.57185247986435e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.855008602142334, + "num_tokens": 86358002.0, + "step": 2259 + }, + { + "epoch": 0.28749522961455287, + "ewc_loss": 0.0172119140625, + "ewc_loss_parallel": 1.71661376953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.818626403808594, + "learning_rate": 9.576091564222128e-07, + "loss": 0.4207, + "mean_token_accuracy": 0.8626700639724731, + "num_tokens": 86395334.0, + "step": 2260 + }, + { + "epoch": 0.28762243989314334, + "ewc_loss": 0.0172119140625, + "ewc_loss_parallel": 1.71661376953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.09760093688965, + "learning_rate": 9.580330648579906e-07, + "loss": 0.498, + "mean_token_accuracy": 0.8389296531677246, + "num_tokens": 86432616.0, + "step": 2261 + }, + { + "epoch": 0.2877496501717339, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.695180892944336, + "learning_rate": 9.584569732937685e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8425763845443726, + "num_tokens": 86472293.0, + "step": 2262 + }, + { + "epoch": 0.2878768604503244, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.863964080810547, + "learning_rate": 9.588808817295463e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8422583937644958, + "num_tokens": 86506304.0, + "step": 2263 + }, + { + "epoch": 0.2880040707289149, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.687061309814453, + "learning_rate": 9.593047901653242e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8444689512252808, + "num_tokens": 86540520.0, + "step": 2264 + }, + { + "epoch": 0.2881312810075054, + "ewc_loss": 0.0169677734375, + "ewc_loss_parallel": 1.6927719116210938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.851882934570312, + "learning_rate": 9.597286986011022e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.857108473777771, + "num_tokens": 86576956.0, + "step": 2265 + }, + { + "epoch": 0.28825849128609593, + "ewc_loss": 0.0172119140625, + "ewc_loss_parallel": 1.71661376953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.547170639038086, + "learning_rate": 9.601526070368799e-07, + "loss": 0.5267, + "mean_token_accuracy": 0.8340678215026855, + "num_tokens": 86613799.0, + "step": 2266 + }, + { + "epoch": 0.2883857015646864, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.76983642578125, + "learning_rate": 9.60576515472658e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8520174026489258, + "num_tokens": 86650338.0, + "step": 2267 + }, + { + "epoch": 0.28851291184327693, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.914087295532227, + "learning_rate": 9.610004239084358e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.84613037109375, + "num_tokens": 86691605.0, + "step": 2268 + }, + { + "epoch": 0.28864012212186746, + "ewc_loss": 0.0172119140625, + "ewc_loss_parallel": 1.71661376953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.518590927124023, + "learning_rate": 9.614243323442136e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8255656957626343, + "num_tokens": 86728767.0, + "step": 2269 + }, + { + "epoch": 0.28876733240045793, + "ewc_loss": 0.01708984375, + "ewc_loss_parallel": 1.704692840576172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.77357292175293, + "learning_rate": 9.618482407799915e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8516212701797485, + "num_tokens": 86765240.0, + "step": 2270 + }, + { + "epoch": 0.28889454267904846, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.7757511138916, + "learning_rate": 9.622721492157693e-07, + "loss": 0.426, + "mean_token_accuracy": 0.8645527362823486, + "num_tokens": 86804042.0, + "step": 2271 + }, + { + "epoch": 0.289021752957639, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.8135986328125, + "learning_rate": 9.626960576515472e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8467485904693604, + "num_tokens": 86835220.0, + "step": 2272 + }, + { + "epoch": 0.28914896323622946, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.04121208190918, + "learning_rate": 9.63119966087325e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8500820994377136, + "num_tokens": 86873414.0, + "step": 2273 + }, + { + "epoch": 0.28927617351482, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.92688751220703, + "learning_rate": 9.635438745231029e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.855850875377655, + "num_tokens": 86914272.0, + "step": 2274 + }, + { + "epoch": 0.2894033837934105, + "ewc_loss": 0.0172119140625, + "ewc_loss_parallel": 1.71661376953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.758665084838867, + "learning_rate": 9.63967782958881e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8402325510978699, + "num_tokens": 86952425.0, + "step": 2275 + }, + { + "epoch": 0.289530594072001, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.824670791625977, + "learning_rate": 9.643916913946588e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8558037877082825, + "num_tokens": 86987842.0, + "step": 2276 + }, + { + "epoch": 0.2896578043505915, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.893285751342773, + "learning_rate": 9.648155998304366e-07, + "loss": 0.4255, + "mean_token_accuracy": 0.8641119003295898, + "num_tokens": 87026750.0, + "step": 2277 + }, + { + "epoch": 0.28978501462918205, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.733125686645508, + "learning_rate": 9.652395082662145e-07, + "loss": 0.5297, + "mean_token_accuracy": 0.8308703899383545, + "num_tokens": 87069685.0, + "step": 2278 + }, + { + "epoch": 0.2899122249077726, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.91075325012207, + "learning_rate": 9.656634167019923e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8416273593902588, + "num_tokens": 87107054.0, + "step": 2279 + }, + { + "epoch": 0.29003943518636305, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.025291442871094, + "learning_rate": 9.660873251377701e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8425164222717285, + "num_tokens": 87146254.0, + "step": 2280 + }, + { + "epoch": 0.2901666454649536, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.773775100708008, + "learning_rate": 9.66511233573548e-07, + "loss": 0.517, + "mean_token_accuracy": 0.8396442532539368, + "num_tokens": 87182404.0, + "step": 2281 + }, + { + "epoch": 0.2902938557435441, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.816444396972656, + "learning_rate": 9.669351420093258e-07, + "loss": 0.4483, + "mean_token_accuracy": 0.8564375042915344, + "num_tokens": 87216551.0, + "step": 2282 + }, + { + "epoch": 0.2904210660221346, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.174331665039062, + "learning_rate": 9.67359050445104e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8485612869262695, + "num_tokens": 87249402.0, + "step": 2283 + }, + { + "epoch": 0.2905482763007251, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.690649032592773, + "learning_rate": 9.677829588808817e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8507636785507202, + "num_tokens": 87281846.0, + "step": 2284 + }, + { + "epoch": 0.29067548657931563, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.926721572875977, + "learning_rate": 9.682068673166596e-07, + "loss": 0.5311, + "mean_token_accuracy": 0.835615336894989, + "num_tokens": 87315765.0, + "step": 2285 + }, + { + "epoch": 0.2908026968579061, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.937973022460938, + "learning_rate": 9.686307757524374e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.8478906154632568, + "num_tokens": 87353668.0, + "step": 2286 + }, + { + "epoch": 0.29092990713649663, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.721569061279297, + "learning_rate": 9.690546841882153e-07, + "loss": 0.4031, + "mean_token_accuracy": 0.8694809675216675, + "num_tokens": 87390747.0, + "step": 2287 + }, + { + "epoch": 0.29105711741508716, + "ewc_loss": 0.0172119140625, + "ewc_loss_parallel": 1.71661376953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.85981559753418, + "learning_rate": 9.694785926239931e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8480643630027771, + "num_tokens": 87430686.0, + "step": 2288 + }, + { + "epoch": 0.29118432769367764, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.80729866027832, + "learning_rate": 9.69902501059771e-07, + "loss": 0.5258, + "mean_token_accuracy": 0.8329977989196777, + "num_tokens": 87465290.0, + "step": 2289 + }, + { + "epoch": 0.29131153797226816, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.879173278808594, + "learning_rate": 9.703264094955488e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8410463929176331, + "num_tokens": 87504553.0, + "step": 2290 + }, + { + "epoch": 0.2914387482508587, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.819087982177734, + "learning_rate": 9.707503179313269e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8534063100814819, + "num_tokens": 87544896.0, + "step": 2291 + }, + { + "epoch": 0.29156595852944917, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.893999099731445, + "learning_rate": 9.711742263671047e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8517369031906128, + "num_tokens": 87582075.0, + "step": 2292 + }, + { + "epoch": 0.2916931688080397, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.79585838317871, + "learning_rate": 9.715981348028826e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8534086346626282, + "num_tokens": 87623117.0, + "step": 2293 + }, + { + "epoch": 0.2918203790866302, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.985994338989258, + "learning_rate": 9.720220432386604e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8511906862258911, + "num_tokens": 87656263.0, + "step": 2294 + }, + { + "epoch": 0.2919475893652207, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.918212890625, + "learning_rate": 9.724459516744383e-07, + "loss": 0.4441, + "mean_token_accuracy": 0.8582050800323486, + "num_tokens": 87695852.0, + "step": 2295 + }, + { + "epoch": 0.2920747996438112, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.970806121826172, + "learning_rate": 9.728698601102161e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8584553003311157, + "num_tokens": 87734331.0, + "step": 2296 + }, + { + "epoch": 0.29220200992240175, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.858932495117188, + "learning_rate": 9.73293768545994e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8437983989715576, + "num_tokens": 87768447.0, + "step": 2297 + }, + { + "epoch": 0.2923292202009922, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.973424911499023, + "learning_rate": 9.737176769817718e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8443831205368042, + "num_tokens": 87813213.0, + "step": 2298 + }, + { + "epoch": 0.29245643047958275, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.87557029724121, + "learning_rate": 9.741415854175499e-07, + "loss": 0.5415, + "mean_token_accuracy": 0.8328657150268555, + "num_tokens": 87851336.0, + "step": 2299 + }, + { + "epoch": 0.2925836407581733, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.876117706298828, + "learning_rate": 9.745654938533277e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.8421332240104675, + "num_tokens": 87888474.0, + "step": 2300 + }, + { + "epoch": 0.29271085103676375, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.82928466796875, + "learning_rate": 9.749894022891056e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8373050093650818, + "num_tokens": 87928232.0, + "step": 2301 + }, + { + "epoch": 0.2928380613153543, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.936182022094727, + "learning_rate": 9.754133107248834e-07, + "loss": 0.4368, + "mean_token_accuracy": 0.8586188554763794, + "num_tokens": 87966982.0, + "step": 2302 + }, + { + "epoch": 0.2929652715939448, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.95132064819336, + "learning_rate": 9.758372191606612e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8327370285987854, + "num_tokens": 88003550.0, + "step": 2303 + }, + { + "epoch": 0.2930924818725353, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.96668815612793, + "learning_rate": 9.76261127596439e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.8615291118621826, + "num_tokens": 88038038.0, + "step": 2304 + }, + { + "epoch": 0.2932196921511258, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.01139259338379, + "learning_rate": 9.76685036032217e-07, + "loss": 0.5066, + "mean_token_accuracy": 0.8364408016204834, + "num_tokens": 88074627.0, + "step": 2305 + }, + { + "epoch": 0.29334690242971634, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.974939346313477, + "learning_rate": 9.771089444679948e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8559218645095825, + "num_tokens": 88116772.0, + "step": 2306 + }, + { + "epoch": 0.2934741127083068, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.834997177124023, + "learning_rate": 9.775328529037728e-07, + "loss": 0.4935, + "mean_token_accuracy": 0.8416805267333984, + "num_tokens": 88152888.0, + "step": 2307 + }, + { + "epoch": 0.29360132298689734, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.596940994262695, + "learning_rate": 9.779567613395507e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.8367881774902344, + "num_tokens": 88184544.0, + "step": 2308 + }, + { + "epoch": 0.29372853326548787, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.033180236816406, + "learning_rate": 9.783806697753285e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.852021336555481, + "num_tokens": 88221603.0, + "step": 2309 + }, + { + "epoch": 0.29385574354407834, + "ewc_loss": 0.0174560546875, + "ewc_loss_parallel": 1.7404556274414062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.88749122619629, + "learning_rate": 9.788045782111064e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.8583465218544006, + "num_tokens": 88265109.0, + "step": 2310 + }, + { + "epoch": 0.29398295382266887, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.22272491455078, + "learning_rate": 9.792284866468842e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8602224588394165, + "num_tokens": 88302756.0, + "step": 2311 + }, + { + "epoch": 0.2941101641012594, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.02560043334961, + "learning_rate": 9.79652395082662e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8568153381347656, + "num_tokens": 88335605.0, + "step": 2312 + }, + { + "epoch": 0.29423737437984987, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.363445281982422, + "learning_rate": 9.8007630351844e-07, + "loss": 0.5016, + "mean_token_accuracy": 0.8421366214752197, + "num_tokens": 88369447.0, + "step": 2313 + }, + { + "epoch": 0.2943645846584404, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.00010871887207, + "learning_rate": 9.805002119542178e-07, + "loss": 0.5315, + "mean_token_accuracy": 0.8311786651611328, + "num_tokens": 88404442.0, + "step": 2314 + }, + { + "epoch": 0.2944917949370309, + "ewc_loss": 0.017333984375, + "ewc_loss_parallel": 1.728534698486328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.10611343383789, + "learning_rate": 9.809241203899958e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8496510982513428, + "num_tokens": 88445870.0, + "step": 2315 + }, + { + "epoch": 0.2946190052156214, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.940444946289062, + "learning_rate": 9.813480288257737e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8394631743431091, + "num_tokens": 88480994.0, + "step": 2316 + }, + { + "epoch": 0.2947462154942119, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.04210090637207, + "learning_rate": 9.817719372615515e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8573328256607056, + "num_tokens": 88519666.0, + "step": 2317 + }, + { + "epoch": 0.29487342577280246, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.97551918029785, + "learning_rate": 9.821958456973294e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.8640345335006714, + "num_tokens": 88553855.0, + "step": 2318 + }, + { + "epoch": 0.29500063605139293, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.97370147705078, + "learning_rate": 9.826197541331072e-07, + "loss": 0.4365, + "mean_token_accuracy": 0.8607592582702637, + "num_tokens": 88589053.0, + "step": 2319 + }, + { + "epoch": 0.29512784632998346, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7762184143066406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.095083236694336, + "learning_rate": 9.83043662568885e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.8438201546669006, + "num_tokens": 88627067.0, + "step": 2320 + }, + { + "epoch": 0.295255056608574, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.028217315673828, + "learning_rate": 9.83467571004663e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.858614444732666, + "num_tokens": 88668030.0, + "step": 2321 + }, + { + "epoch": 0.29538226688716446, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.988243103027344, + "learning_rate": 9.838914794404407e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8608827590942383, + "num_tokens": 88708889.0, + "step": 2322 + }, + { + "epoch": 0.295509477165755, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.935901641845703, + "learning_rate": 9.843153878762188e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8427949547767639, + "num_tokens": 88745689.0, + "step": 2323 + }, + { + "epoch": 0.2956366874443455, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.06087303161621, + "learning_rate": 9.847392963119966e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8544406294822693, + "num_tokens": 88784667.0, + "step": 2324 + }, + { + "epoch": 0.295763897722936, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.140625, + "learning_rate": 9.851632047477745e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8491194248199463, + "num_tokens": 88827334.0, + "step": 2325 + }, + { + "epoch": 0.2958911080015265, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.204429626464844, + "learning_rate": 9.855871131835523e-07, + "loss": 0.4028, + "mean_token_accuracy": 0.8688396215438843, + "num_tokens": 88863744.0, + "step": 2326 + }, + { + "epoch": 0.29601831828011704, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.26107406616211, + "learning_rate": 9.860110216193302e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8617672324180603, + "num_tokens": 88903693.0, + "step": 2327 + }, + { + "epoch": 0.2961455285587075, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.881258010864258, + "learning_rate": 9.86434930055108e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.8614164590835571, + "num_tokens": 88936668.0, + "step": 2328 + }, + { + "epoch": 0.29627273883729804, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.93040657043457, + "learning_rate": 9.868588384908859e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8569693565368652, + "num_tokens": 88970920.0, + "step": 2329 + }, + { + "epoch": 0.2963999491158886, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.808265686035156, + "learning_rate": 9.872827469266637e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8553948402404785, + "num_tokens": 89010856.0, + "step": 2330 + }, + { + "epoch": 0.2965271593944791, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.89480209350586, + "learning_rate": 9.877066553624418e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8552908897399902, + "num_tokens": 89047109.0, + "step": 2331 + }, + { + "epoch": 0.2966543696730696, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.001693725585938, + "learning_rate": 9.881305637982196e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8536298274993896, + "num_tokens": 89087859.0, + "step": 2332 + }, + { + "epoch": 0.2967815799516601, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.16016387939453, + "learning_rate": 9.885544722339975e-07, + "loss": 0.4429, + "mean_token_accuracy": 0.8562581539154053, + "num_tokens": 89123601.0, + "step": 2333 + }, + { + "epoch": 0.29690879023025063, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.464635848999023, + "learning_rate": 9.889783806697753e-07, + "loss": 0.4254, + "mean_token_accuracy": 0.8644710183143616, + "num_tokens": 89160133.0, + "step": 2334 + }, + { + "epoch": 0.2970360005088411, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.774890899658203, + "learning_rate": 9.894022891055532e-07, + "loss": 0.4186, + "mean_token_accuracy": 0.8666499257087708, + "num_tokens": 89196731.0, + "step": 2335 + }, + { + "epoch": 0.29716321078743163, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.301063537597656, + "learning_rate": 9.89826197541331e-07, + "loss": 0.4077, + "mean_token_accuracy": 0.8683470487594604, + "num_tokens": 89230159.0, + "step": 2336 + }, + { + "epoch": 0.29729042106602216, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.026226043701172, + "learning_rate": 9.902501059771089e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.8478020429611206, + "num_tokens": 89264648.0, + "step": 2337 + }, + { + "epoch": 0.29741763134461263, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.091310501098633, + "learning_rate": 9.906740144128867e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8456758260726929, + "num_tokens": 89302505.0, + "step": 2338 + }, + { + "epoch": 0.29754484162320316, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.207128524780273, + "learning_rate": 9.910979228486648e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.8594080209732056, + "num_tokens": 89338079.0, + "step": 2339 + }, + { + "epoch": 0.2976720519017937, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.010007858276367, + "learning_rate": 9.915218312844426e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8463428020477295, + "num_tokens": 89376717.0, + "step": 2340 + }, + { + "epoch": 0.29779926218038416, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7762184143066406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.148639678955078, + "learning_rate": 9.919457397202205e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8436863422393799, + "num_tokens": 89416164.0, + "step": 2341 + }, + { + "epoch": 0.2979264724589747, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7762184143066406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.15279769897461, + "learning_rate": 9.923696481559983e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8594964742660522, + "num_tokens": 89446491.0, + "step": 2342 + }, + { + "epoch": 0.2980536827375652, + "ewc_loss": 0.0177001953125, + "ewc_loss_parallel": 1.7642974853515625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.056640625, + "learning_rate": 9.927935565917761e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8419390916824341, + "num_tokens": 89483637.0, + "step": 2343 + }, + { + "epoch": 0.2981808930161557, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.042572021484375, + "learning_rate": 9.93217465027554e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8509517908096313, + "num_tokens": 89526532.0, + "step": 2344 + }, + { + "epoch": 0.2983081032947462, + "ewc_loss": 0.017578125, + "ewc_loss_parallel": 1.7523765563964844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.993000030517578, + "learning_rate": 9.936413734633318e-07, + "loss": 0.5419, + "mean_token_accuracy": 0.8324784636497498, + "num_tokens": 89558353.0, + "step": 2345 + }, + { + "epoch": 0.29843531357333675, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.89861488342285, + "learning_rate": 9.940652818991097e-07, + "loss": 0.4342, + "mean_token_accuracy": 0.8615548610687256, + "num_tokens": 89598727.0, + "step": 2346 + }, + { + "epoch": 0.2985625238519272, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7762184143066406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.066408157348633, + "learning_rate": 9.944891903348877e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8393524885177612, + "num_tokens": 89640093.0, + "step": 2347 + }, + { + "epoch": 0.29868973413051775, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.029521942138672, + "learning_rate": 9.949130987706656e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8568928241729736, + "num_tokens": 89681401.0, + "step": 2348 + }, + { + "epoch": 0.2988169444091083, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.954605102539062, + "learning_rate": 9.953370072064432e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.8582218885421753, + "num_tokens": 89720366.0, + "step": 2349 + }, + { + "epoch": 0.29894415468769875, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.094512939453125, + "learning_rate": 9.957609156422213e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.835008978843689, + "num_tokens": 89762234.0, + "step": 2350 + }, + { + "epoch": 0.2990713649662893, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.07025909423828, + "learning_rate": 9.961848240779991e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8531187176704407, + "num_tokens": 89792208.0, + "step": 2351 + }, + { + "epoch": 0.2991985752448798, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.984466552734375, + "learning_rate": 9.96608732513777e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.8630532026290894, + "num_tokens": 89833246.0, + "step": 2352 + }, + { + "epoch": 0.2993257855234703, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.13919448852539, + "learning_rate": 9.970326409495548e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.8392009735107422, + "num_tokens": 89873271.0, + "step": 2353 + }, + { + "epoch": 0.2994529958020608, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.298873901367188, + "learning_rate": 9.974565493853327e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8352450132369995, + "num_tokens": 89913069.0, + "step": 2354 + }, + { + "epoch": 0.29958020608065133, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.401819229125977, + "learning_rate": 9.978804578211107e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8419837355613708, + "num_tokens": 89948915.0, + "step": 2355 + }, + { + "epoch": 0.2997074163592418, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.473176956176758, + "learning_rate": 9.983043662568886e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8445240259170532, + "num_tokens": 89982123.0, + "step": 2356 + }, + { + "epoch": 0.29983462663783234, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.068702697753906, + "learning_rate": 9.987282746926662e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.858441174030304, + "num_tokens": 90018259.0, + "step": 2357 + }, + { + "epoch": 0.29996183691642286, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.13272476196289, + "learning_rate": 9.991521831284443e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8531167507171631, + "num_tokens": 90059041.0, + "step": 2358 + }, + { + "epoch": 0.30008904719501334, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.281721115112305, + "learning_rate": 9.995760915642221e-07, + "loss": 0.4134, + "mean_token_accuracy": 0.8664848208427429, + "num_tokens": 90089463.0, + "step": 2359 + }, + { + "epoch": 0.30021625747360386, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.059173583984375, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8592876195907593, + "num_tokens": 90125691.0, + "step": 2360 + }, + { + "epoch": 0.3003434677521944, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.223709106445312, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8476400375366211, + "num_tokens": 90161778.0, + "step": 2361 + }, + { + "epoch": 0.30047067803078487, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.051006317138672, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8360084295272827, + "num_tokens": 90202243.0, + "step": 2362 + }, + { + "epoch": 0.3005978883093754, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.115028381347656, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8432847857475281, + "num_tokens": 90248924.0, + "step": 2363 + }, + { + "epoch": 0.3007250985879659, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.262943267822266, + "learning_rate": 1e-06, + "loss": 0.5821, + "mean_token_accuracy": 0.8220176696777344, + "num_tokens": 90281988.0, + "step": 2364 + }, + { + "epoch": 0.3008523088665564, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.183088302612305, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8442250490188599, + "num_tokens": 90317050.0, + "step": 2365 + }, + { + "epoch": 0.3009795191451469, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.895492553710938, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8581901788711548, + "num_tokens": 90358064.0, + "step": 2366 + }, + { + "epoch": 0.30110672942373745, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.322288513183594, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8393821716308594, + "num_tokens": 90400692.0, + "step": 2367 + }, + { + "epoch": 0.3012339397023279, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.023193359375, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8413727283477783, + "num_tokens": 90432207.0, + "step": 2368 + }, + { + "epoch": 0.30136114998091845, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.184070587158203, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8466888666152954, + "num_tokens": 90466081.0, + "step": 2369 + }, + { + "epoch": 0.301488360259509, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.30136489868164, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8563944697380066, + "num_tokens": 90501963.0, + "step": 2370 + }, + { + "epoch": 0.30161557053809945, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.146333694458008, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8613779544830322, + "num_tokens": 90540651.0, + "step": 2371 + }, + { + "epoch": 0.30174278081669, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.036352157592773, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8412851095199585, + "num_tokens": 90585105.0, + "step": 2372 + }, + { + "epoch": 0.3018699910952805, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.12156105041504, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8472088575363159, + "num_tokens": 90626773.0, + "step": 2373 + }, + { + "epoch": 0.301997201373871, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 17.976667404174805, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8642392158508301, + "num_tokens": 90665974.0, + "step": 2374 + }, + { + "epoch": 0.3021244116524615, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.010122299194336, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8448747396469116, + "num_tokens": 90706801.0, + "step": 2375 + }, + { + "epoch": 0.30225162193105204, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.162857055664062, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8459033370018005, + "num_tokens": 90743635.0, + "step": 2376 + }, + { + "epoch": 0.3023788322096425, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.098430633544922, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8500732183456421, + "num_tokens": 90782797.0, + "step": 2377 + }, + { + "epoch": 0.30250604248823304, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.2091121673584, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8426514863967896, + "num_tokens": 90822661.0, + "step": 2378 + }, + { + "epoch": 0.30263325276682357, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.34140968322754, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8327630758285522, + "num_tokens": 90857804.0, + "step": 2379 + }, + { + "epoch": 0.3027604630454141, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.154403686523438, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8477676510810852, + "num_tokens": 90895778.0, + "step": 2380 + }, + { + "epoch": 0.30288767332400457, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.005029678344727, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8568584322929382, + "num_tokens": 90934463.0, + "step": 2381 + }, + { + "epoch": 0.3030148836025951, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.646236419677734, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8545093536376953, + "num_tokens": 90967888.0, + "step": 2382 + }, + { + "epoch": 0.3031420938811856, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.52532958984375, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8574329614639282, + "num_tokens": 91005200.0, + "step": 2383 + }, + { + "epoch": 0.3032693041597761, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.190898895263672, + "learning_rate": 1e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.833408772945404, + "num_tokens": 91037840.0, + "step": 2384 + }, + { + "epoch": 0.3033965144383666, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.036640167236328, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.83632493019104, + "num_tokens": 91073336.0, + "step": 2385 + }, + { + "epoch": 0.30352372471695716, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.47882080078125, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8503527045249939, + "num_tokens": 91107001.0, + "step": 2386 + }, + { + "epoch": 0.30365093499554763, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.141252517700195, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8560794591903687, + "num_tokens": 91143937.0, + "step": 2387 + }, + { + "epoch": 0.30377814527413816, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.00504493713379, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8394514322280884, + "num_tokens": 91185391.0, + "step": 2388 + }, + { + "epoch": 0.3039053555527287, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.49518394470215, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8485589027404785, + "num_tokens": 91227653.0, + "step": 2389 + }, + { + "epoch": 0.30403256583131916, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.052968978881836, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8694751262664795, + "num_tokens": 91263610.0, + "step": 2390 + }, + { + "epoch": 0.3041597761099097, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.144086837768555, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8443995714187622, + "num_tokens": 91296880.0, + "step": 2391 + }, + { + "epoch": 0.3042869863885002, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.31254768371582, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8370500206947327, + "num_tokens": 91336242.0, + "step": 2392 + }, + { + "epoch": 0.3044141966670907, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.277942657470703, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8566747307777405, + "num_tokens": 91376527.0, + "step": 2393 + }, + { + "epoch": 0.3045414069456812, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.14729881286621, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8397624492645264, + "num_tokens": 91408415.0, + "step": 2394 + }, + { + "epoch": 0.30466861722427174, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.37980079650879, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8500796556472778, + "num_tokens": 91449189.0, + "step": 2395 + }, + { + "epoch": 0.3047958275028622, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.335182189941406, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8462612628936768, + "num_tokens": 91488599.0, + "step": 2396 + }, + { + "epoch": 0.30492303778145274, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.30771827697754, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8551372289657593, + "num_tokens": 91525138.0, + "step": 2397 + }, + { + "epoch": 0.30505024806004327, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.221784591674805, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8518155813217163, + "num_tokens": 91567444.0, + "step": 2398 + }, + { + "epoch": 0.30517745833863374, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.18751335144043, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8471107482910156, + "num_tokens": 91602926.0, + "step": 2399 + }, + { + "epoch": 0.3053046686172243, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.55335235595703, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.861280620098114, + "num_tokens": 91635901.0, + "step": 2400 + }, + { + "epoch": 0.3054318788958148, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.255205154418945, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8516491651535034, + "num_tokens": 91679542.0, + "step": 2401 + }, + { + "epoch": 0.3055590891744053, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.080541610717773, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.844903290271759, + "num_tokens": 91723022.0, + "step": 2402 + }, + { + "epoch": 0.3056862994529958, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.44685173034668, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.84502774477005, + "num_tokens": 91760664.0, + "step": 2403 + }, + { + "epoch": 0.30581350973158633, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.233692169189453, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8461461067199707, + "num_tokens": 91805570.0, + "step": 2404 + }, + { + "epoch": 0.3059407200101768, + "ewc_loss": 0.017822265625, + "ewc_loss_parallel": 1.7881393432617188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.182025909423828, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8414931297302246, + "num_tokens": 91843342.0, + "step": 2405 + }, + { + "epoch": 0.30606793028876733, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.31741714477539, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8407564163208008, + "num_tokens": 91883872.0, + "step": 2406 + }, + { + "epoch": 0.30619514056735786, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.087648391723633, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8486841917037964, + "num_tokens": 91923174.0, + "step": 2407 + }, + { + "epoch": 0.30632235084594833, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.391843795776367, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.848781943321228, + "num_tokens": 91963462.0, + "step": 2408 + }, + { + "epoch": 0.30644956112453886, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.19814682006836, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8533178567886353, + "num_tokens": 92000907.0, + "step": 2409 + }, + { + "epoch": 0.3065767714031294, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.086793899536133, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8512652516365051, + "num_tokens": 92036110.0, + "step": 2410 + }, + { + "epoch": 0.30670398168171986, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.240779876708984, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.845737636089325, + "num_tokens": 92080647.0, + "step": 2411 + }, + { + "epoch": 0.3068311919603104, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.155752182006836, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8535138368606567, + "num_tokens": 92115407.0, + "step": 2412 + }, + { + "epoch": 0.3069584022389009, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.275001525878906, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8500757217407227, + "num_tokens": 92149660.0, + "step": 2413 + }, + { + "epoch": 0.3070856125174914, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.140043258666992, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8525623083114624, + "num_tokens": 92191581.0, + "step": 2414 + }, + { + "epoch": 0.3072128227960819, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.361120223999023, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8446232676506042, + "num_tokens": 92230435.0, + "step": 2415 + }, + { + "epoch": 0.30734003307467245, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.132970809936523, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.837756872177124, + "num_tokens": 92269545.0, + "step": 2416 + }, + { + "epoch": 0.3074672433532629, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.36236572265625, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8572521209716797, + "num_tokens": 92307447.0, + "step": 2417 + }, + { + "epoch": 0.30759445363185345, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.130847930908203, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8481963276863098, + "num_tokens": 92349441.0, + "step": 2418 + }, + { + "epoch": 0.307721663910444, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.48912811279297, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8544466495513916, + "num_tokens": 92380405.0, + "step": 2419 + }, + { + "epoch": 0.30784887418903445, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.231294631958008, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8481523394584656, + "num_tokens": 92419596.0, + "step": 2420 + }, + { + "epoch": 0.307976084467625, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.241167068481445, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8423651456832886, + "num_tokens": 92454590.0, + "step": 2421 + }, + { + "epoch": 0.3081032947462155, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.27459144592285, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8445221781730652, + "num_tokens": 92491640.0, + "step": 2422 + }, + { + "epoch": 0.308230505024806, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.412647247314453, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8405555486679077, + "num_tokens": 92528682.0, + "step": 2423 + }, + { + "epoch": 0.3083577153033965, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.2515811920166, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8564356565475464, + "num_tokens": 92566368.0, + "step": 2424 + }, + { + "epoch": 0.30848492558198704, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.230730056762695, + "learning_rate": 1e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8274973630905151, + "num_tokens": 92601380.0, + "step": 2425 + }, + { + "epoch": 0.3086121358605775, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.181743621826172, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8659114837646484, + "num_tokens": 92639724.0, + "step": 2426 + }, + { + "epoch": 0.30873934613916804, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.32028579711914, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8556919097900391, + "num_tokens": 92677450.0, + "step": 2427 + }, + { + "epoch": 0.30886655641775856, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.356565475463867, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8573735952377319, + "num_tokens": 92714681.0, + "step": 2428 + }, + { + "epoch": 0.3089937666963491, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.19818687438965, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8467022180557251, + "num_tokens": 92749925.0, + "step": 2429 + }, + { + "epoch": 0.30912097697493957, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.169771194458008, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8534141778945923, + "num_tokens": 92791720.0, + "step": 2430 + }, + { + "epoch": 0.3092481872535301, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.32694435119629, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8419428467750549, + "num_tokens": 92826179.0, + "step": 2431 + }, + { + "epoch": 0.3093753975321206, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.123292922973633, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8497340083122253, + "num_tokens": 92872751.0, + "step": 2432 + }, + { + "epoch": 0.3095026078107111, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.312843322753906, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8438581824302673, + "num_tokens": 92907470.0, + "step": 2433 + }, + { + "epoch": 0.3096298180893016, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.12096405029297, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8547062277793884, + "num_tokens": 92943321.0, + "step": 2434 + }, + { + "epoch": 0.30975702836789215, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.18924331665039, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8512157797813416, + "num_tokens": 92982136.0, + "step": 2435 + }, + { + "epoch": 0.3098842386464826, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.33397102355957, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8387995958328247, + "num_tokens": 93015974.0, + "step": 2436 + }, + { + "epoch": 0.31001144892507315, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.208215713500977, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8525780439376831, + "num_tokens": 93054010.0, + "step": 2437 + }, + { + "epoch": 0.3101386592036637, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.120521545410156, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8570492267608643, + "num_tokens": 93090226.0, + "step": 2438 + }, + { + "epoch": 0.31026586948225415, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.285091400146484, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8461980223655701, + "num_tokens": 93130242.0, + "step": 2439 + }, + { + "epoch": 0.3103930797608447, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.129587173461914, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8456978797912598, + "num_tokens": 93170990.0, + "step": 2440 + }, + { + "epoch": 0.3105202900394352, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.324474334716797, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8421214818954468, + "num_tokens": 93205478.0, + "step": 2441 + }, + { + "epoch": 0.3106475003180257, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.167375564575195, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8422609567642212, + "num_tokens": 93240952.0, + "step": 2442 + }, + { + "epoch": 0.3107747105966162, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.292831420898438, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8381716012954712, + "num_tokens": 93278882.0, + "step": 2443 + }, + { + "epoch": 0.31090192087520674, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.23102569580078, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8416039347648621, + "num_tokens": 93324573.0, + "step": 2444 + }, + { + "epoch": 0.3110291311537972, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.19495964050293, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.863502025604248, + "num_tokens": 93361049.0, + "step": 2445 + }, + { + "epoch": 0.31115634143238774, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.25240135192871, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8465648889541626, + "num_tokens": 93401748.0, + "step": 2446 + }, + { + "epoch": 0.31128355171097827, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.212093353271484, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8609666228294373, + "num_tokens": 93434857.0, + "step": 2447 + }, + { + "epoch": 0.31141076198956874, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.320880889892578, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8472992181777954, + "num_tokens": 93474681.0, + "step": 2448 + }, + { + "epoch": 0.31153797226815927, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.24153709411621, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8473387360572815, + "num_tokens": 93510627.0, + "step": 2449 + }, + { + "epoch": 0.3116651825467498, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.261558532714844, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8396289348602295, + "num_tokens": 93551598.0, + "step": 2450 + }, + { + "epoch": 0.31179239282534027, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.31833267211914, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8426158428192139, + "num_tokens": 93586622.0, + "step": 2451 + }, + { + "epoch": 0.3119196031039308, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.226261138916016, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8383059501647949, + "num_tokens": 93626496.0, + "step": 2452 + }, + { + "epoch": 0.3120468133825213, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.386592864990234, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8507805466651917, + "num_tokens": 93665641.0, + "step": 2453 + }, + { + "epoch": 0.3121740236611118, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.381322860717773, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8545463681221008, + "num_tokens": 93702135.0, + "step": 2454 + }, + { + "epoch": 0.3123012339397023, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.330156326293945, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8554707169532776, + "num_tokens": 93738226.0, + "step": 2455 + }, + { + "epoch": 0.31242844421829286, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.45296287536621, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8520175218582153, + "num_tokens": 93780273.0, + "step": 2456 + }, + { + "epoch": 0.31255565449688333, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.446924209594727, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8366209268569946, + "num_tokens": 93821977.0, + "step": 2457 + }, + { + "epoch": 0.31268286477547386, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.470590591430664, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8576492071151733, + "num_tokens": 93856084.0, + "step": 2458 + }, + { + "epoch": 0.3128100750540644, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.595977783203125, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.852412223815918, + "num_tokens": 93893491.0, + "step": 2459 + }, + { + "epoch": 0.31293728533265486, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.88178253173828, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.83777916431427, + "num_tokens": 93940240.0, + "step": 2460 + }, + { + "epoch": 0.3130644956112454, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.425708770751953, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8409446477890015, + "num_tokens": 93974504.0, + "step": 2461 + }, + { + "epoch": 0.3131917058898359, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.037540435791016, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8511963486671448, + "num_tokens": 94009208.0, + "step": 2462 + }, + { + "epoch": 0.3133189161684264, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.133028030395508, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8495776057243347, + "num_tokens": 94049249.0, + "step": 2463 + }, + { + "epoch": 0.3134461264470169, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.935224533081055, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8490167856216431, + "num_tokens": 94085553.0, + "step": 2464 + }, + { + "epoch": 0.31357333672560744, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.427677154541016, + "learning_rate": 1e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8296159505844116, + "num_tokens": 94127531.0, + "step": 2465 + }, + { + "epoch": 0.3137005470041979, + "ewc_loss": 0.0179443359375, + "ewc_loss_parallel": 1.800060272216797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.216032028198242, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8608163595199585, + "num_tokens": 94171191.0, + "step": 2466 + }, + { + "epoch": 0.31382775728278844, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.31682777404785, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8545234203338623, + "num_tokens": 94212090.0, + "step": 2467 + }, + { + "epoch": 0.313954967561379, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.49441146850586, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8676466941833496, + "num_tokens": 94246835.0, + "step": 2468 + }, + { + "epoch": 0.31408217783996945, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.372337341308594, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8469221591949463, + "num_tokens": 94284581.0, + "step": 2469 + }, + { + "epoch": 0.31420938811856, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58326530456543, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8596336841583252, + "num_tokens": 94323050.0, + "step": 2470 + }, + { + "epoch": 0.3143365983971505, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.339916229248047, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8436723947525024, + "num_tokens": 94364702.0, + "step": 2471 + }, + { + "epoch": 0.314463808675741, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.410329818725586, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8549140691757202, + "num_tokens": 94404735.0, + "step": 2472 + }, + { + "epoch": 0.3145910189543315, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.692659378051758, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.834281325340271, + "num_tokens": 94444683.0, + "step": 2473 + }, + { + "epoch": 0.31471822923292203, + "ewc_loss": 0.0181884765625, + "ewc_loss_parallel": 1.823902130126953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.294527053833008, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8525226712226868, + "num_tokens": 94484392.0, + "step": 2474 + }, + { + "epoch": 0.3148454395115125, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.82837677001953, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8567503094673157, + "num_tokens": 94524860.0, + "step": 2475 + }, + { + "epoch": 0.31497264979010303, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.172441482543945, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8459969162940979, + "num_tokens": 94565076.0, + "step": 2476 + }, + { + "epoch": 0.31509986006869356, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.34014129638672, + "learning_rate": 1e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8261749148368835, + "num_tokens": 94606191.0, + "step": 2477 + }, + { + "epoch": 0.31522707034728403, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.23594093322754, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8440148234367371, + "num_tokens": 94648021.0, + "step": 2478 + }, + { + "epoch": 0.31535428062587456, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.41504669189453, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8605002760887146, + "num_tokens": 94686305.0, + "step": 2479 + }, + { + "epoch": 0.3154814909044651, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.023969650268555, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8473485708236694, + "num_tokens": 94728606.0, + "step": 2480 + }, + { + "epoch": 0.3156087011830556, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.12834358215332, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8524676561355591, + "num_tokens": 94770162.0, + "step": 2481 + }, + { + "epoch": 0.3157359114616461, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.502317428588867, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8563441038131714, + "num_tokens": 94807735.0, + "step": 2482 + }, + { + "epoch": 0.3158631217402366, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.030746459960938, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.844109058380127, + "num_tokens": 94840366.0, + "step": 2483 + }, + { + "epoch": 0.31599033201882715, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.303377151489258, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8457484245300293, + "num_tokens": 94874293.0, + "step": 2484 + }, + { + "epoch": 0.3161175422974176, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.16417694091797, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8489395976066589, + "num_tokens": 94909347.0, + "step": 2485 + }, + { + "epoch": 0.31624475257600815, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.364011764526367, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8594827651977539, + "num_tokens": 94955120.0, + "step": 2486 + }, + { + "epoch": 0.3163719628545987, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.130395889282227, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8635391592979431, + "num_tokens": 94992319.0, + "step": 2487 + }, + { + "epoch": 0.31649917313318915, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.331567764282227, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8597838878631592, + "num_tokens": 95023464.0, + "step": 2488 + }, + { + "epoch": 0.3166263834117797, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.219757080078125, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.86271071434021, + "num_tokens": 95055963.0, + "step": 2489 + }, + { + "epoch": 0.3167535936903702, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.217823028564453, + "learning_rate": 1e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8314908742904663, + "num_tokens": 95089628.0, + "step": 2490 + }, + { + "epoch": 0.3168808039689607, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.366323471069336, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8643625974655151, + "num_tokens": 95128557.0, + "step": 2491 + }, + { + "epoch": 0.3170080142475512, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.16297149658203, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8378991484642029, + "num_tokens": 95168324.0, + "step": 2492 + }, + { + "epoch": 0.31713522452614173, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.19845962524414, + "learning_rate": 1e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.8301461338996887, + "num_tokens": 95208139.0, + "step": 2493 + }, + { + "epoch": 0.3172624348047322, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.2425479888916, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8446944952011108, + "num_tokens": 95247337.0, + "step": 2494 + }, + { + "epoch": 0.31738964508332274, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.40341567993164, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8429699540138245, + "num_tokens": 95289208.0, + "step": 2495 + }, + { + "epoch": 0.31751685536191326, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.23289680480957, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8478590250015259, + "num_tokens": 95329717.0, + "step": 2496 + }, + { + "epoch": 0.31764406564050374, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.292858123779297, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8293315768241882, + "num_tokens": 95371052.0, + "step": 2497 + }, + { + "epoch": 0.31777127591909426, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.290769577026367, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8703939914703369, + "num_tokens": 95412486.0, + "step": 2498 + }, + { + "epoch": 0.3178984861976848, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.29134178161621, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8661432266235352, + "num_tokens": 95446167.0, + "step": 2499 + }, + { + "epoch": 0.31802569647627527, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.306806564331055, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8445620536804199, + "num_tokens": 95485763.0, + "step": 2500 + }, + { + "epoch": 0.3181529067548658, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.31886100769043, + "learning_rate": 1e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8387401103973389, + "num_tokens": 95526407.0, + "step": 2501 + }, + { + "epoch": 0.3182801170334563, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.288236618041992, + "learning_rate": 1e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8176649212837219, + "num_tokens": 95567479.0, + "step": 2502 + }, + { + "epoch": 0.3184073273120468, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.169227600097656, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.856611967086792, + "num_tokens": 95606588.0, + "step": 2503 + }, + { + "epoch": 0.3185345375906373, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.327245712280273, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8541443347930908, + "num_tokens": 95653293.0, + "step": 2504 + }, + { + "epoch": 0.31866174786922785, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.40208625793457, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8403949737548828, + "num_tokens": 95692424.0, + "step": 2505 + }, + { + "epoch": 0.3187889581478183, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.34638214111328, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8585959672927856, + "num_tokens": 95733679.0, + "step": 2506 + }, + { + "epoch": 0.31891616842640885, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.34052848815918, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8451075553894043, + "num_tokens": 95768211.0, + "step": 2507 + }, + { + "epoch": 0.3190433787049994, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.324573516845703, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8477520942687988, + "num_tokens": 95805974.0, + "step": 2508 + }, + { + "epoch": 0.31917058898358985, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.50644302368164, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8483642935752869, + "num_tokens": 95845463.0, + "step": 2509 + }, + { + "epoch": 0.3192977992621804, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.243947982788086, + "learning_rate": 1e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8452005386352539, + "num_tokens": 95888148.0, + "step": 2510 + }, + { + "epoch": 0.3194250095407709, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.290145874023438, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.870600700378418, + "num_tokens": 95925432.0, + "step": 2511 + }, + { + "epoch": 0.3195522198193614, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.165443420410156, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8559191823005676, + "num_tokens": 95970695.0, + "step": 2512 + }, + { + "epoch": 0.3196794300979519, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.364425659179688, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8506932258605957, + "num_tokens": 96006093.0, + "step": 2513 + }, + { + "epoch": 0.31980664037654244, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.119176864624023, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8494522571563721, + "num_tokens": 96045945.0, + "step": 2514 + }, + { + "epoch": 0.3199338506551329, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58434295654297, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8647292852401733, + "num_tokens": 96076748.0, + "step": 2515 + }, + { + "epoch": 0.32006106093372344, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.368209838867188, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8509847521781921, + "num_tokens": 96116964.0, + "step": 2516 + }, + { + "epoch": 0.32018827121231397, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.34535026550293, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8527777194976807, + "num_tokens": 96152073.0, + "step": 2517 + }, + { + "epoch": 0.32031548149090444, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.27676773071289, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8484737873077393, + "num_tokens": 96191608.0, + "step": 2518 + }, + { + "epoch": 0.32044269176949497, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.41425895690918, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8674360513687134, + "num_tokens": 96228124.0, + "step": 2519 + }, + { + "epoch": 0.3205699020480855, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.260902404785156, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8608043193817139, + "num_tokens": 96265870.0, + "step": 2520 + }, + { + "epoch": 0.32069711232667597, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.234647750854492, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8600730299949646, + "num_tokens": 96309623.0, + "step": 2521 + }, + { + "epoch": 0.3208243226052665, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.412019729614258, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8420732021331787, + "num_tokens": 96344036.0, + "step": 2522 + }, + { + "epoch": 0.320951532883857, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.277158737182617, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8615325093269348, + "num_tokens": 96377627.0, + "step": 2523 + }, + { + "epoch": 0.3210787431624475, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.350414276123047, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8509925007820129, + "num_tokens": 96413568.0, + "step": 2524 + }, + { + "epoch": 0.32120595344103803, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.22789764404297, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8467742800712585, + "num_tokens": 96454682.0, + "step": 2525 + }, + { + "epoch": 0.32133316371962856, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.398380279541016, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8391166925430298, + "num_tokens": 96490456.0, + "step": 2526 + }, + { + "epoch": 0.32146037399821903, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.31603240966797, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8441852927207947, + "num_tokens": 96526891.0, + "step": 2527 + }, + { + "epoch": 0.32158758427680956, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.272968292236328, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8442441821098328, + "num_tokens": 96575226.0, + "step": 2528 + }, + { + "epoch": 0.3217147945554001, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.260942459106445, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.839619517326355, + "num_tokens": 96606383.0, + "step": 2529 + }, + { + "epoch": 0.3218420048339906, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.290767669677734, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8520424962043762, + "num_tokens": 96639826.0, + "step": 2530 + }, + { + "epoch": 0.3219692151125811, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.335189819335938, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8544858694076538, + "num_tokens": 96679151.0, + "step": 2531 + }, + { + "epoch": 0.3220964253911716, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.357723236083984, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8572138547897339, + "num_tokens": 96719365.0, + "step": 2532 + }, + { + "epoch": 0.32222363566976214, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.356502532958984, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8476420044898987, + "num_tokens": 96758318.0, + "step": 2533 + }, + { + "epoch": 0.3223508459483526, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.37061309814453, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8448919653892517, + "num_tokens": 96791159.0, + "step": 2534 + }, + { + "epoch": 0.32247805622694314, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.30552101135254, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8523802757263184, + "num_tokens": 96825352.0, + "step": 2535 + }, + { + "epoch": 0.32260526650553367, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.414161682128906, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8488186001777649, + "num_tokens": 96859700.0, + "step": 2536 + }, + { + "epoch": 0.32273247678412414, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.21934700012207, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8484101891517639, + "num_tokens": 96897077.0, + "step": 2537 + }, + { + "epoch": 0.3228596870627147, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.41180992126465, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8450804948806763, + "num_tokens": 96936936.0, + "step": 2538 + }, + { + "epoch": 0.3229868973413052, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.174699783325195, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.856186032295227, + "num_tokens": 96975702.0, + "step": 2539 + }, + { + "epoch": 0.3231141076198957, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.370532989501953, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8487446308135986, + "num_tokens": 97010585.0, + "step": 2540 + }, + { + "epoch": 0.3232413178984862, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.302379608154297, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8740602135658264, + "num_tokens": 97047210.0, + "step": 2541 + }, + { + "epoch": 0.32336852817707673, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.328210830688477, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8450117707252502, + "num_tokens": 97084835.0, + "step": 2542 + }, + { + "epoch": 0.3234957384556672, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.314979553222656, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8474457263946533, + "num_tokens": 97125829.0, + "step": 2543 + }, + { + "epoch": 0.32362294873425773, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.195398330688477, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8437923192977905, + "num_tokens": 97162874.0, + "step": 2544 + }, + { + "epoch": 0.32375015901284826, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.499406814575195, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.853674054145813, + "num_tokens": 97197559.0, + "step": 2545 + }, + { + "epoch": 0.32387736929143873, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.472591400146484, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8370122313499451, + "num_tokens": 97227028.0, + "step": 2546 + }, + { + "epoch": 0.32400457957002926, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.346235275268555, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.84568190574646, + "num_tokens": 97270209.0, + "step": 2547 + }, + { + "epoch": 0.3241317898486198, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.70754051208496, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8532497882843018, + "num_tokens": 97309176.0, + "step": 2548 + }, + { + "epoch": 0.32425900012721026, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.401504516601562, + "learning_rate": 1e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8298766613006592, + "num_tokens": 97340464.0, + "step": 2549 + }, + { + "epoch": 0.3243862104058008, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.993671417236328, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8530110120773315, + "num_tokens": 97378435.0, + "step": 2550 + }, + { + "epoch": 0.3245134206843913, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.288528442382812, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8463670015335083, + "num_tokens": 97417886.0, + "step": 2551 + }, + { + "epoch": 0.3246406309629818, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.629304885864258, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.853192925453186, + "num_tokens": 97450552.0, + "step": 2552 + }, + { + "epoch": 0.3247678412415723, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.44365882873535, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8571586012840271, + "num_tokens": 97488586.0, + "step": 2553 + }, + { + "epoch": 0.32489505152016285, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.356712341308594, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8456220626831055, + "num_tokens": 97528893.0, + "step": 2554 + }, + { + "epoch": 0.3250222617987533, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48281478881836, + "learning_rate": 1e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8265083432197571, + "num_tokens": 97568654.0, + "step": 2555 + }, + { + "epoch": 0.32514947207734385, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.491622924804688, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8534296154975891, + "num_tokens": 97607068.0, + "step": 2556 + }, + { + "epoch": 0.3252766823559344, + "ewc_loss": 0.01806640625, + "ewc_loss_parallel": 1.811981201171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.141998291015625, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8717786073684692, + "num_tokens": 97647505.0, + "step": 2557 + }, + { + "epoch": 0.32540389263452485, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.1624698638916, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8438711166381836, + "num_tokens": 97690505.0, + "step": 2558 + }, + { + "epoch": 0.3255311029131154, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.98193359375, + "learning_rate": 1e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8381083011627197, + "num_tokens": 97723955.0, + "step": 2559 + }, + { + "epoch": 0.3256583131917059, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.3891544342041, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8443644642829895, + "num_tokens": 97763822.0, + "step": 2560 + }, + { + "epoch": 0.3257855234702964, + "ewc_loss": 0.018310546875, + "ewc_loss_parallel": 1.8358230590820312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.27897071838379, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8448662757873535, + "num_tokens": 97803827.0, + "step": 2561 + }, + { + "epoch": 0.3259127337488869, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43033790588379, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8499809503555298, + "num_tokens": 97841056.0, + "step": 2562 + }, + { + "epoch": 0.32603994402747744, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.668474197387695, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8672713041305542, + "num_tokens": 97881714.0, + "step": 2563 + }, + { + "epoch": 0.3261671543060679, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.33933448791504, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8468438386917114, + "num_tokens": 97925797.0, + "step": 2564 + }, + { + "epoch": 0.32629436458465844, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.31231117248535, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.854313850402832, + "num_tokens": 97963638.0, + "step": 2565 + }, + { + "epoch": 0.32642157486324896, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.466251373291016, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8432685136795044, + "num_tokens": 98002043.0, + "step": 2566 + }, + { + "epoch": 0.32654878514183944, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.239919662475586, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8522003889083862, + "num_tokens": 98041031.0, + "step": 2567 + }, + { + "epoch": 0.32667599542042997, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.5793514251709, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8577349185943604, + "num_tokens": 98075510.0, + "step": 2568 + }, + { + "epoch": 0.3268032056990205, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.413009643554688, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8556965589523315, + "num_tokens": 98114162.0, + "step": 2569 + }, + { + "epoch": 0.32693041597761097, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.435089111328125, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.844971776008606, + "num_tokens": 98148722.0, + "step": 2570 + }, + { + "epoch": 0.3270576262562015, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.22075080871582, + "learning_rate": 1e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8292291164398193, + "num_tokens": 98188255.0, + "step": 2571 + }, + { + "epoch": 0.327184836534792, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.484277725219727, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.859457790851593, + "num_tokens": 98228597.0, + "step": 2572 + }, + { + "epoch": 0.3273120468133825, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.209604263305664, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8414795398712158, + "num_tokens": 98273405.0, + "step": 2573 + }, + { + "epoch": 0.327439257091973, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57332992553711, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8356603980064392, + "num_tokens": 98315091.0, + "step": 2574 + }, + { + "epoch": 0.32756646737056355, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.384389877319336, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8505857586860657, + "num_tokens": 98356967.0, + "step": 2575 + }, + { + "epoch": 0.327693677649154, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.283205032348633, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8401296138763428, + "num_tokens": 98394455.0, + "step": 2576 + }, + { + "epoch": 0.32782088792774455, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.40604591369629, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8588406443595886, + "num_tokens": 98430455.0, + "step": 2577 + }, + { + "epoch": 0.3279480982063351, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.416683197021484, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8359020352363586, + "num_tokens": 98472218.0, + "step": 2578 + }, + { + "epoch": 0.3280753084849256, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.367677688598633, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8568689227104187, + "num_tokens": 98509540.0, + "step": 2579 + }, + { + "epoch": 0.3282025187635161, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.305191040039062, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8492423892021179, + "num_tokens": 98544761.0, + "step": 2580 + }, + { + "epoch": 0.3283297290421066, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.339570999145508, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8618676662445068, + "num_tokens": 98580522.0, + "step": 2581 + }, + { + "epoch": 0.32845693932069714, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.63553810119629, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8577818870544434, + "num_tokens": 98612988.0, + "step": 2582 + }, + { + "epoch": 0.3285841495992876, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.317581176757812, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.838445782661438, + "num_tokens": 98654213.0, + "step": 2583 + }, + { + "epoch": 0.32871135987787814, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.462661743164062, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8526546359062195, + "num_tokens": 98697168.0, + "step": 2584 + }, + { + "epoch": 0.32883857015646867, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.36846923828125, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.855007529258728, + "num_tokens": 98735664.0, + "step": 2585 + }, + { + "epoch": 0.32896578043505914, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.380531311035156, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8443198204040527, + "num_tokens": 98777257.0, + "step": 2586 + }, + { + "epoch": 0.32909299071364967, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.67214012145996, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8537855744361877, + "num_tokens": 98812443.0, + "step": 2587 + }, + { + "epoch": 0.3292202009922402, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.414865493774414, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8592246770858765, + "num_tokens": 98848231.0, + "step": 2588 + }, + { + "epoch": 0.32934741127083067, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.34205436706543, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8537173271179199, + "num_tokens": 98886487.0, + "step": 2589 + }, + { + "epoch": 0.3294746215494212, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.500619888305664, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8403750061988831, + "num_tokens": 98924750.0, + "step": 2590 + }, + { + "epoch": 0.3296018318280117, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.366920471191406, + "learning_rate": 1e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8276427984237671, + "num_tokens": 98964151.0, + "step": 2591 + }, + { + "epoch": 0.3297290421066022, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.346372604370117, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8609395623207092, + "num_tokens": 99000277.0, + "step": 2592 + }, + { + "epoch": 0.3298562523851927, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.41107940673828, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8406355381011963, + "num_tokens": 99042335.0, + "step": 2593 + }, + { + "epoch": 0.32998346266378326, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43718719482422, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8480590581893921, + "num_tokens": 99082964.0, + "step": 2594 + }, + { + "epoch": 0.33011067294237373, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.19483184814453, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8730267882347107, + "num_tokens": 99123440.0, + "step": 2595 + }, + { + "epoch": 0.33023788322096426, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57537841796875, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8484148979187012, + "num_tokens": 99167047.0, + "step": 2596 + }, + { + "epoch": 0.3303650934995548, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.500011444091797, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8471360206604004, + "num_tokens": 99202594.0, + "step": 2597 + }, + { + "epoch": 0.33049230377814526, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.171613693237305, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8558638095855713, + "num_tokens": 99244732.0, + "step": 2598 + }, + { + "epoch": 0.3306195140567358, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.37139129638672, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8611335754394531, + "num_tokens": 99287161.0, + "step": 2599 + }, + { + "epoch": 0.3307467243353263, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.49859046936035, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.842889666557312, + "num_tokens": 99322650.0, + "step": 2600 + }, + { + "epoch": 0.3308739346139168, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.34486961364746, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8511542081832886, + "num_tokens": 99358529.0, + "step": 2601 + }, + { + "epoch": 0.3310011448925073, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.53764533996582, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8591058850288391, + "num_tokens": 99394010.0, + "step": 2602 + }, + { + "epoch": 0.33112835517109784, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.768457412719727, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8531057834625244, + "num_tokens": 99434764.0, + "step": 2603 + }, + { + "epoch": 0.3312555654496883, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.56258201599121, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8517315983772278, + "num_tokens": 99481322.0, + "step": 2604 + }, + { + "epoch": 0.33138277572827884, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.784875869750977, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.84108567237854, + "num_tokens": 99517719.0, + "step": 2605 + }, + { + "epoch": 0.3315099860068694, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.33026885986328, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8528051376342773, + "num_tokens": 99553662.0, + "step": 2606 + }, + { + "epoch": 0.33163719628545985, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.576332092285156, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8498051762580872, + "num_tokens": 99589195.0, + "step": 2607 + }, + { + "epoch": 0.3317644065640504, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.633485794067383, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8528138399124146, + "num_tokens": 99626092.0, + "step": 2608 + }, + { + "epoch": 0.3318916168426409, + "ewc_loss": 0.0184326171875, + "ewc_loss_parallel": 1.8477439880371094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.1984806060791, + "learning_rate": 1e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8257788419723511, + "num_tokens": 99667087.0, + "step": 2609 + }, + { + "epoch": 0.3320188271212314, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10811424255371, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8452918529510498, + "num_tokens": 99705929.0, + "step": 2610 + }, + { + "epoch": 0.3321460373998219, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.033369064331055, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8468040227890015, + "num_tokens": 99742044.0, + "step": 2611 + }, + { + "epoch": 0.33227324767841243, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.441423416137695, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8534076809883118, + "num_tokens": 99772621.0, + "step": 2612 + }, + { + "epoch": 0.3324004579570029, + "ewc_loss": 0.0186767578125, + "ewc_loss_parallel": 1.8715858459472656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.406795501708984, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8563200831413269, + "num_tokens": 99814405.0, + "step": 2613 + }, + { + "epoch": 0.33252766823559343, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.888103485107422, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8601411581039429, + "num_tokens": 99852197.0, + "step": 2614 + }, + { + "epoch": 0.33265487851418396, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.404033660888672, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8461335897445679, + "num_tokens": 99894254.0, + "step": 2615 + }, + { + "epoch": 0.33278208879277443, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.329769134521484, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8443981409072876, + "num_tokens": 99936230.0, + "step": 2616 + }, + { + "epoch": 0.33290929907136496, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.47880744934082, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8515423536300659, + "num_tokens": 99974964.0, + "step": 2617 + }, + { + "epoch": 0.3330365093499555, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.47620964050293, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8413597941398621, + "num_tokens": 100018750.0, + "step": 2618 + }, + { + "epoch": 0.33316371962854596, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.88874053955078, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.845581591129303, + "num_tokens": 100054064.0, + "step": 2619 + }, + { + "epoch": 0.3332909299071365, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.738508224487305, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8551256656646729, + "num_tokens": 100092549.0, + "step": 2620 + }, + { + "epoch": 0.333418140185727, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.287845611572266, + "learning_rate": 1e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8248963356018066, + "num_tokens": 100123973.0, + "step": 2621 + }, + { + "epoch": 0.3335453504643175, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.73760223388672, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8489435911178589, + "num_tokens": 100162142.0, + "step": 2622 + }, + { + "epoch": 0.333672560742908, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.2774715423584, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8455225229263306, + "num_tokens": 100199352.0, + "step": 2623 + }, + { + "epoch": 0.33379977102149855, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.612871170043945, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.834881603717804, + "num_tokens": 100237446.0, + "step": 2624 + }, + { + "epoch": 0.333926981300089, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.292034149169922, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8558849096298218, + "num_tokens": 100272986.0, + "step": 2625 + }, + { + "epoch": 0.33405419157867955, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.544424057006836, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8412197232246399, + "num_tokens": 100304998.0, + "step": 2626 + }, + { + "epoch": 0.3341814018572701, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.404296875, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8475615978240967, + "num_tokens": 100338473.0, + "step": 2627 + }, + { + "epoch": 0.33430861213586055, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.492280960083008, + "learning_rate": 1e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8333479762077332, + "num_tokens": 100372644.0, + "step": 2628 + }, + { + "epoch": 0.3344358224144511, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.347448348999023, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8302616477012634, + "num_tokens": 100413248.0, + "step": 2629 + }, + { + "epoch": 0.3345630326930416, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.480241775512695, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.843526303768158, + "num_tokens": 100448022.0, + "step": 2630 + }, + { + "epoch": 0.33469024297163213, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.30765724182129, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8384615182876587, + "num_tokens": 100493222.0, + "step": 2631 + }, + { + "epoch": 0.3348174532502226, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.7630558013916, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8557265996932983, + "num_tokens": 100534736.0, + "step": 2632 + }, + { + "epoch": 0.33494466352881314, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.18147087097168, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8580765724182129, + "num_tokens": 100576754.0, + "step": 2633 + }, + { + "epoch": 0.33507187380740366, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.472536087036133, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.841376006603241, + "num_tokens": 100611407.0, + "step": 2634 + }, + { + "epoch": 0.33519908408599414, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.348989486694336, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8558165431022644, + "num_tokens": 100643531.0, + "step": 2635 + }, + { + "epoch": 0.33532629436458466, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.37377166748047, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8506353497505188, + "num_tokens": 100678612.0, + "step": 2636 + }, + { + "epoch": 0.3354535046431752, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.39253807067871, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8543996810913086, + "num_tokens": 100718873.0, + "step": 2637 + }, + { + "epoch": 0.33558071492176567, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.410226821899414, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.848812997341156, + "num_tokens": 100759883.0, + "step": 2638 + }, + { + "epoch": 0.3357079252003562, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.457427978515625, + "learning_rate": 1e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8294825553894043, + "num_tokens": 100800022.0, + "step": 2639 + }, + { + "epoch": 0.3358351354789467, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.333147048950195, + "learning_rate": 1e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8345330953598022, + "num_tokens": 100837299.0, + "step": 2640 + }, + { + "epoch": 0.3359623457575372, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.470653533935547, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8455988764762878, + "num_tokens": 100868433.0, + "step": 2641 + }, + { + "epoch": 0.3360895560361277, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.24016761779785, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8346339464187622, + "num_tokens": 100907768.0, + "step": 2642 + }, + { + "epoch": 0.33621676631471825, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.4697265625, + "learning_rate": 1e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8263153433799744, + "num_tokens": 100942723.0, + "step": 2643 + }, + { + "epoch": 0.3363439765933087, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.475858688354492, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.855508029460907, + "num_tokens": 100984997.0, + "step": 2644 + }, + { + "epoch": 0.33647118687189925, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.092538833618164, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8646842241287231, + "num_tokens": 101026290.0, + "step": 2645 + }, + { + "epoch": 0.3365983971504898, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.49498176574707, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8432400226593018, + "num_tokens": 101066766.0, + "step": 2646 + }, + { + "epoch": 0.33672560742908025, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.556169509887695, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8538544178009033, + "num_tokens": 101100833.0, + "step": 2647 + }, + { + "epoch": 0.3368528177076708, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.764875411987305, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8605191707611084, + "num_tokens": 101145552.0, + "step": 2648 + }, + { + "epoch": 0.3369800279862613, + "ewc_loss": 0.0185546875, + "ewc_loss_parallel": 1.8596649169921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.86440658569336, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8515102863311768, + "num_tokens": 101186644.0, + "step": 2649 + }, + { + "epoch": 0.3371072382648518, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.480728149414062, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8473519682884216, + "num_tokens": 101223356.0, + "step": 2650 + }, + { + "epoch": 0.3372344485434423, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.079322814941406, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8626083135604858, + "num_tokens": 101260527.0, + "step": 2651 + }, + { + "epoch": 0.33736165882203284, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.466426849365234, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8491604924201965, + "num_tokens": 101305403.0, + "step": 2652 + }, + { + "epoch": 0.3374888691006233, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.917930603027344, + "learning_rate": 1e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8345515727996826, + "num_tokens": 101345858.0, + "step": 2653 + }, + { + "epoch": 0.33761607937921384, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.790916442871094, + "learning_rate": 1e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8258463144302368, + "num_tokens": 101380321.0, + "step": 2654 + }, + { + "epoch": 0.33774328965780437, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.266429901123047, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8576384782791138, + "num_tokens": 101419120.0, + "step": 2655 + }, + { + "epoch": 0.33787049993639484, + "ewc_loss": 0.0189208984375, + "ewc_loss_parallel": 1.895427703857422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.620149612426758, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8508172035217285, + "num_tokens": 101453272.0, + "step": 2656 + }, + { + "epoch": 0.33799771021498537, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.696758270263672, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8382672071456909, + "num_tokens": 101486879.0, + "step": 2657 + }, + { + "epoch": 0.3381249204935759, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.56434440612793, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8426237106323242, + "num_tokens": 101523691.0, + "step": 2658 + }, + { + "epoch": 0.33825213077216637, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.476966857910156, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8513563871383667, + "num_tokens": 101555279.0, + "step": 2659 + }, + { + "epoch": 0.3383793410507569, + "ewc_loss": 0.01904296875, + "ewc_loss_parallel": 1.9073486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43203353881836, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8659622073173523, + "num_tokens": 101591780.0, + "step": 2660 + }, + { + "epoch": 0.3385065513293474, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.616249084472656, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8658378720283508, + "num_tokens": 101631041.0, + "step": 2661 + }, + { + "epoch": 0.3386337616079379, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.778717041015625, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.839815616607666, + "num_tokens": 101666315.0, + "step": 2662 + }, + { + "epoch": 0.33876097188652843, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.42728042602539, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8627534508705139, + "num_tokens": 101704046.0, + "step": 2663 + }, + { + "epoch": 0.33888818216511896, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.294776916503906, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8602102994918823, + "num_tokens": 101743076.0, + "step": 2664 + }, + { + "epoch": 0.33901539244370943, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.587114334106445, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8624060750007629, + "num_tokens": 101784006.0, + "step": 2665 + }, + { + "epoch": 0.33914260272229996, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.832576751708984, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8566746115684509, + "num_tokens": 101821845.0, + "step": 2666 + }, + { + "epoch": 0.3392698130008905, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.364276885986328, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8557161092758179, + "num_tokens": 101861121.0, + "step": 2667 + }, + { + "epoch": 0.33939702327948096, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.446847915649414, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8461463451385498, + "num_tokens": 101894330.0, + "step": 2668 + }, + { + "epoch": 0.3395242335580715, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43179702758789, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8350558876991272, + "num_tokens": 101934835.0, + "step": 2669 + }, + { + "epoch": 0.339651443836662, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.481809616088867, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.832831859588623, + "num_tokens": 101970943.0, + "step": 2670 + }, + { + "epoch": 0.3397786541152525, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.634754180908203, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8384507894515991, + "num_tokens": 102001211.0, + "step": 2671 + }, + { + "epoch": 0.339905864393843, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.474998474121094, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8566561937332153, + "num_tokens": 102039399.0, + "step": 2672 + }, + { + "epoch": 0.34003307467243354, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.568269729614258, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8391613960266113, + "num_tokens": 102070635.0, + "step": 2673 + }, + { + "epoch": 0.340160284951024, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.447965621948242, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8542780876159668, + "num_tokens": 102104590.0, + "step": 2674 + }, + { + "epoch": 0.34028749522961454, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.519067764282227, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8556637763977051, + "num_tokens": 102149267.0, + "step": 2675 + }, + { + "epoch": 0.3404147055082051, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.41292381286621, + "learning_rate": 1e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.834528386592865, + "num_tokens": 102188089.0, + "step": 2676 + }, + { + "epoch": 0.34054191578679555, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.38273811340332, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8566601276397705, + "num_tokens": 102224335.0, + "step": 2677 + }, + { + "epoch": 0.3406691260653861, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.421768188476562, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8539751768112183, + "num_tokens": 102263421.0, + "step": 2678 + }, + { + "epoch": 0.3407963363439766, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.51719856262207, + "learning_rate": 1e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8341475129127502, + "num_tokens": 102301133.0, + "step": 2679 + }, + { + "epoch": 0.34092354662256713, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.470144271850586, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8543187379837036, + "num_tokens": 102338516.0, + "step": 2680 + }, + { + "epoch": 0.3410507569011576, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.544212341308594, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8399574756622314, + "num_tokens": 102376010.0, + "step": 2681 + }, + { + "epoch": 0.34117796717974813, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.289499282836914, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8714907169342041, + "num_tokens": 102417080.0, + "step": 2682 + }, + { + "epoch": 0.34130517745833866, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.645437240600586, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.852203369140625, + "num_tokens": 102458042.0, + "step": 2683 + }, + { + "epoch": 0.34143238773692913, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.594280242919922, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8572964668273926, + "num_tokens": 102497152.0, + "step": 2684 + }, + { + "epoch": 0.34155959801551966, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.382299423217773, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8473320603370667, + "num_tokens": 102531486.0, + "step": 2685 + }, + { + "epoch": 0.3416868082941102, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.091073989868164, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8487061262130737, + "num_tokens": 102571552.0, + "step": 2686 + }, + { + "epoch": 0.34181401857270066, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.37589454650879, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8558155298233032, + "num_tokens": 102607338.0, + "step": 2687 + }, + { + "epoch": 0.3419412288512912, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.916542053222656, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8495237827301025, + "num_tokens": 102654445.0, + "step": 2688 + }, + { + "epoch": 0.3420684391298817, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.5660457611084, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8579016327857971, + "num_tokens": 102695265.0, + "step": 2689 + }, + { + "epoch": 0.3421956494084722, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.118757247924805, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.854520320892334, + "num_tokens": 102728621.0, + "step": 2690 + }, + { + "epoch": 0.3423228596870627, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.039579391479492, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8438474535942078, + "num_tokens": 102766378.0, + "step": 2691 + }, + { + "epoch": 0.34245006996565325, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.50501251220703, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8486864566802979, + "num_tokens": 102806331.0, + "step": 2692 + }, + { + "epoch": 0.3425772802442437, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.255701065063477, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8531554341316223, + "num_tokens": 102845314.0, + "step": 2693 + }, + { + "epoch": 0.34270449052283425, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.42424201965332, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8493474721908569, + "num_tokens": 102888614.0, + "step": 2694 + }, + { + "epoch": 0.3428317008014248, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.825929641723633, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.845681369304657, + "num_tokens": 102925988.0, + "step": 2695 + }, + { + "epoch": 0.34295891108001525, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.51646614074707, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8590449690818787, + "num_tokens": 102963150.0, + "step": 2696 + }, + { + "epoch": 0.3430861213586058, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.35372543334961, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8584451675415039, + "num_tokens": 102999164.0, + "step": 2697 + }, + { + "epoch": 0.3432133316371963, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.50250244140625, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8595266342163086, + "num_tokens": 103034337.0, + "step": 2698 + }, + { + "epoch": 0.3433405419157868, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.511104583740234, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8567541837692261, + "num_tokens": 103075062.0, + "step": 2699 + }, + { + "epoch": 0.3434677521943773, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.501277923583984, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8457967638969421, + "num_tokens": 103113209.0, + "step": 2700 + }, + { + "epoch": 0.34359496247296784, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.53826332092285, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8545535802841187, + "num_tokens": 103153117.0, + "step": 2701 + }, + { + "epoch": 0.3437221727515583, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.411235809326172, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8499491810798645, + "num_tokens": 103192234.0, + "step": 2702 + }, + { + "epoch": 0.34384938303014884, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.531564712524414, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8475536108016968, + "num_tokens": 103224958.0, + "step": 2703 + }, + { + "epoch": 0.34397659330873936, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.47452163696289, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.852213442325592, + "num_tokens": 103259679.0, + "step": 2704 + }, + { + "epoch": 0.34410380358732984, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.40119743347168, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8419912457466125, + "num_tokens": 103299987.0, + "step": 2705 + }, + { + "epoch": 0.34423101386592037, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.129283905029297, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.837023138999939, + "num_tokens": 103344457.0, + "step": 2706 + }, + { + "epoch": 0.3443582241445109, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.203815460205078, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8554825782775879, + "num_tokens": 103385173.0, + "step": 2707 + }, + { + "epoch": 0.34448543442310137, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.00712776184082, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8360617756843567, + "num_tokens": 103425972.0, + "step": 2708 + }, + { + "epoch": 0.3446126447016919, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.66189193725586, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8466558456420898, + "num_tokens": 103466819.0, + "step": 2709 + }, + { + "epoch": 0.3447398549802824, + "ewc_loss": 0.018798828125, + "ewc_loss_parallel": 1.8835067749023438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.806028366088867, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8392894268035889, + "num_tokens": 103509660.0, + "step": 2710 + }, + { + "epoch": 0.3448670652588729, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.407974243164062, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8446166515350342, + "num_tokens": 103544557.0, + "step": 2711 + }, + { + "epoch": 0.3449942755374634, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.202796936035156, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8604120016098022, + "num_tokens": 103585132.0, + "step": 2712 + }, + { + "epoch": 0.34512148581605395, + "ewc_loss": 0.0191650390625, + "ewc_loss_parallel": 1.919269561767578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.451892852783203, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8587521314620972, + "num_tokens": 103626353.0, + "step": 2713 + }, + { + "epoch": 0.3452486960946444, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.900022506713867, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8543280363082886, + "num_tokens": 103659035.0, + "step": 2714 + }, + { + "epoch": 0.34537590637323495, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.584001541137695, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8586034178733826, + "num_tokens": 103699497.0, + "step": 2715 + }, + { + "epoch": 0.3455031166518255, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.400646209716797, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8382759690284729, + "num_tokens": 103733685.0, + "step": 2716 + }, + { + "epoch": 0.34563032693041595, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.4476318359375, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8334447145462036, + "num_tokens": 103769399.0, + "step": 2717 + }, + { + "epoch": 0.3457575372090065, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.841270446777344, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.857903003692627, + "num_tokens": 103809078.0, + "step": 2718 + }, + { + "epoch": 0.345884747487597, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.531719207763672, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8680341839790344, + "num_tokens": 103846764.0, + "step": 2719 + }, + { + "epoch": 0.3460119577661875, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.315078735351562, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8601351380348206, + "num_tokens": 103883089.0, + "step": 2720 + }, + { + "epoch": 0.346139168044778, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.41728401184082, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.867743968963623, + "num_tokens": 103915775.0, + "step": 2721 + }, + { + "epoch": 0.34626637832336854, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57958221435547, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8464194536209106, + "num_tokens": 103953703.0, + "step": 2722 + }, + { + "epoch": 0.346393588601959, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.38986587524414, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8418248891830444, + "num_tokens": 103990775.0, + "step": 2723 + }, + { + "epoch": 0.34652079888054954, + "ewc_loss": 0.019287109375, + "ewc_loss_parallel": 1.9311904907226562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.691516876220703, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8538504838943481, + "num_tokens": 104025927.0, + "step": 2724 + }, + { + "epoch": 0.34664800915914007, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.452722549438477, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8415487408638, + "num_tokens": 104065860.0, + "step": 2725 + }, + { + "epoch": 0.34677521943773054, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.56559944152832, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8536402583122253, + "num_tokens": 104102117.0, + "step": 2726 + }, + { + "epoch": 0.34690242971632107, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.370521545410156, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8477505445480347, + "num_tokens": 104140896.0, + "step": 2727 + }, + { + "epoch": 0.3470296399949116, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.535362243652344, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8490704298019409, + "num_tokens": 104181355.0, + "step": 2728 + }, + { + "epoch": 0.3471568502735021, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43741798400879, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8530251383781433, + "num_tokens": 104218564.0, + "step": 2729 + }, + { + "epoch": 0.3472840605520926, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.30409812927246, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8550900220870972, + "num_tokens": 104256553.0, + "step": 2730 + }, + { + "epoch": 0.3474112708306831, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.44249725341797, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8647228479385376, + "num_tokens": 104291150.0, + "step": 2731 + }, + { + "epoch": 0.34753848110927366, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.45949363708496, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.843645453453064, + "num_tokens": 104327977.0, + "step": 2732 + }, + { + "epoch": 0.34766569138786413, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.513011932373047, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8472195863723755, + "num_tokens": 104365167.0, + "step": 2733 + }, + { + "epoch": 0.34779290166645466, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.365345001220703, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8560781478881836, + "num_tokens": 104400304.0, + "step": 2734 + }, + { + "epoch": 0.3479201119450452, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.552682876586914, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8362336158752441, + "num_tokens": 104437974.0, + "step": 2735 + }, + { + "epoch": 0.34804732222363566, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.45650291442871, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8546220064163208, + "num_tokens": 104474474.0, + "step": 2736 + }, + { + "epoch": 0.3481745325022262, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.582822799682617, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8372484445571899, + "num_tokens": 104511004.0, + "step": 2737 + }, + { + "epoch": 0.3483017427808167, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.50574493408203, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.844788670539856, + "num_tokens": 104551889.0, + "step": 2738 + }, + { + "epoch": 0.3484289530594072, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.533443450927734, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8351964950561523, + "num_tokens": 104591962.0, + "step": 2739 + }, + { + "epoch": 0.3485561633379977, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69934844970703, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8496639132499695, + "num_tokens": 104636201.0, + "step": 2740 + }, + { + "epoch": 0.34868337361658824, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.550260543823242, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.850432276725769, + "num_tokens": 104673448.0, + "step": 2741 + }, + { + "epoch": 0.3488105838951787, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.598270416259766, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8594143390655518, + "num_tokens": 104708879.0, + "step": 2742 + }, + { + "epoch": 0.34893779417376924, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.476911544799805, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.861454963684082, + "num_tokens": 104747073.0, + "step": 2743 + }, + { + "epoch": 0.3490650044523598, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.50271987915039, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8499990701675415, + "num_tokens": 104786950.0, + "step": 2744 + }, + { + "epoch": 0.34919221473095025, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.466991424560547, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8564326763153076, + "num_tokens": 104828306.0, + "step": 2745 + }, + { + "epoch": 0.3493194250095408, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.541812896728516, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8429261445999146, + "num_tokens": 104867030.0, + "step": 2746 + }, + { + "epoch": 0.3494466352881313, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.562776565551758, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8558653593063354, + "num_tokens": 104909182.0, + "step": 2747 + }, + { + "epoch": 0.3495738455667218, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.577516555786133, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8470039963722229, + "num_tokens": 104950648.0, + "step": 2748 + }, + { + "epoch": 0.3497010558453123, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.449552536010742, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8563158512115479, + "num_tokens": 104994780.0, + "step": 2749 + }, + { + "epoch": 0.34982826612390283, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.67073631286621, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.858604907989502, + "num_tokens": 105029469.0, + "step": 2750 + }, + { + "epoch": 0.3499554764024933, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.372819900512695, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8455342650413513, + "num_tokens": 105066797.0, + "step": 2751 + }, + { + "epoch": 0.35008268668108383, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.635953903198242, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8662505149841309, + "num_tokens": 105100869.0, + "step": 2752 + }, + { + "epoch": 0.35020989695967436, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.64019012451172, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8642938137054443, + "num_tokens": 105135409.0, + "step": 2753 + }, + { + "epoch": 0.35033710723826483, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.563058853149414, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8580545783042908, + "num_tokens": 105172839.0, + "step": 2754 + }, + { + "epoch": 0.35046431751685536, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.438201904296875, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8580551743507385, + "num_tokens": 105216656.0, + "step": 2755 + }, + { + "epoch": 0.3505915277954459, + "ewc_loss": 0.0194091796875, + "ewc_loss_parallel": 1.9431114196777344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.556827545166016, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8464251756668091, + "num_tokens": 105253740.0, + "step": 2756 + }, + { + "epoch": 0.35071873807403636, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.51259422302246, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8559082746505737, + "num_tokens": 105290073.0, + "step": 2757 + }, + { + "epoch": 0.3508459483526269, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.484895706176758, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8578698039054871, + "num_tokens": 105325933.0, + "step": 2758 + }, + { + "epoch": 0.3509731586312174, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.598642349243164, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8551473617553711, + "num_tokens": 105362962.0, + "step": 2759 + }, + { + "epoch": 0.3511003689098079, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.466964721679688, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8527059555053711, + "num_tokens": 105410074.0, + "step": 2760 + }, + { + "epoch": 0.3512275791883984, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.36391830444336, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8380764722824097, + "num_tokens": 105450264.0, + "step": 2761 + }, + { + "epoch": 0.35135478946698895, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58502197265625, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8431941270828247, + "num_tokens": 105486566.0, + "step": 2762 + }, + { + "epoch": 0.3514819997455794, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.38300323486328, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8556991815567017, + "num_tokens": 105527185.0, + "step": 2763 + }, + { + "epoch": 0.35160921002416995, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.549631118774414, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.850877583026886, + "num_tokens": 105557401.0, + "step": 2764 + }, + { + "epoch": 0.3517364203027605, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.40610694885254, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8566595315933228, + "num_tokens": 105586877.0, + "step": 2765 + }, + { + "epoch": 0.35186363058135095, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.79396629333496, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.856087327003479, + "num_tokens": 105626988.0, + "step": 2766 + }, + { + "epoch": 0.3519908408599415, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.466785430908203, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8434571027755737, + "num_tokens": 105665796.0, + "step": 2767 + }, + { + "epoch": 0.352118051138532, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.680255889892578, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.842887282371521, + "num_tokens": 105708892.0, + "step": 2768 + }, + { + "epoch": 0.3522452614171225, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.38564682006836, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8463987708091736, + "num_tokens": 105748221.0, + "step": 2769 + }, + { + "epoch": 0.352372471695713, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.521005630493164, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8496288061141968, + "num_tokens": 105784568.0, + "step": 2770 + }, + { + "epoch": 0.35249968197430354, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.810579299926758, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8406500816345215, + "num_tokens": 105827223.0, + "step": 2771 + }, + { + "epoch": 0.352626892252894, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.2817440032959, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8692376613616943, + "num_tokens": 105871014.0, + "step": 2772 + }, + { + "epoch": 0.35275410253148454, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69181251525879, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8473593592643738, + "num_tokens": 105908995.0, + "step": 2773 + }, + { + "epoch": 0.35288131281007507, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.541654586791992, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8450631499290466, + "num_tokens": 105948243.0, + "step": 2774 + }, + { + "epoch": 0.35300852308866554, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43609619140625, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8531532287597656, + "num_tokens": 105983997.0, + "step": 2775 + }, + { + "epoch": 0.35313573336725607, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.626420974731445, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8386644124984741, + "num_tokens": 106020336.0, + "step": 2776 + }, + { + "epoch": 0.3532629436458466, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.63157844543457, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8533502817153931, + "num_tokens": 106053300.0, + "step": 2777 + }, + { + "epoch": 0.35339015392443707, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.366300582885742, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8662402629852295, + "num_tokens": 106091790.0, + "step": 2778 + }, + { + "epoch": 0.3535173642030276, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.670040130615234, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8557504415512085, + "num_tokens": 106123768.0, + "step": 2779 + }, + { + "epoch": 0.3536445744816181, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.56024169921875, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.859740138053894, + "num_tokens": 106158219.0, + "step": 2780 + }, + { + "epoch": 0.35377178476020865, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58602523803711, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8474599123001099, + "num_tokens": 106195795.0, + "step": 2781 + }, + { + "epoch": 0.3538989950387991, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.651142120361328, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8530181050300598, + "num_tokens": 106234973.0, + "step": 2782 + }, + { + "epoch": 0.35402620531738965, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.560073852539062, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8376055955886841, + "num_tokens": 106267297.0, + "step": 2783 + }, + { + "epoch": 0.3541534155959802, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58963394165039, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8594292402267456, + "num_tokens": 106309070.0, + "step": 2784 + }, + { + "epoch": 0.35428062587457065, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.645511627197266, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8560830950737, + "num_tokens": 106345784.0, + "step": 2785 + }, + { + "epoch": 0.3544078361531612, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.402177810668945, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8594247102737427, + "num_tokens": 106383172.0, + "step": 2786 + }, + { + "epoch": 0.3545350464317517, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.701427459716797, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8499774932861328, + "num_tokens": 106416916.0, + "step": 2787 + }, + { + "epoch": 0.3546622567103422, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43926239013672, + "learning_rate": 1e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.836113452911377, + "num_tokens": 106456969.0, + "step": 2788 + }, + { + "epoch": 0.3547894669889327, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.767467498779297, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8492051362991333, + "num_tokens": 106490476.0, + "step": 2789 + }, + { + "epoch": 0.35491667726752324, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.543201446533203, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8524975180625916, + "num_tokens": 106525990.0, + "step": 2790 + }, + { + "epoch": 0.3550438875461137, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.659523010253906, + "learning_rate": 1e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8270209431648254, + "num_tokens": 106562953.0, + "step": 2791 + }, + { + "epoch": 0.35517109782470424, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.536569595336914, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8535606861114502, + "num_tokens": 106599142.0, + "step": 2792 + }, + { + "epoch": 0.35529830810329477, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.496923446655273, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8635623455047607, + "num_tokens": 106636743.0, + "step": 2793 + }, + { + "epoch": 0.35542551838188524, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.537445068359375, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8558200597763062, + "num_tokens": 106678992.0, + "step": 2794 + }, + { + "epoch": 0.35555272866047577, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57658576965332, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8428886532783508, + "num_tokens": 106719884.0, + "step": 2795 + }, + { + "epoch": 0.3556799389390663, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.609106063842773, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.848170280456543, + "num_tokens": 106755224.0, + "step": 2796 + }, + { + "epoch": 0.35580714921765677, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.619224548339844, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8631791472434998, + "num_tokens": 106789366.0, + "step": 2797 + }, + { + "epoch": 0.3559343594962473, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.51898956298828, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8426344990730286, + "num_tokens": 106826382.0, + "step": 2798 + }, + { + "epoch": 0.3560615697748378, + "ewc_loss": 0.01953125, + "ewc_loss_parallel": 1.9550323486328125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.5275936126709, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8423499464988708, + "num_tokens": 106860027.0, + "step": 2799 + }, + { + "epoch": 0.3561887800534283, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.484731674194336, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8597695231437683, + "num_tokens": 106898684.0, + "step": 2800 + }, + { + "epoch": 0.35631599033201883, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.612581253051758, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8525381088256836, + "num_tokens": 106934496.0, + "step": 2801 + }, + { + "epoch": 0.35644320061060936, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58517074584961, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8665155172348022, + "num_tokens": 106974335.0, + "step": 2802 + }, + { + "epoch": 0.35657041088919983, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.499526977539062, + "learning_rate": 1e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8259279727935791, + "num_tokens": 107013914.0, + "step": 2803 + }, + { + "epoch": 0.35669762116779036, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.535152435302734, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8560200929641724, + "num_tokens": 107053747.0, + "step": 2804 + }, + { + "epoch": 0.3568248314463809, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.537193298339844, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8416976928710938, + "num_tokens": 107095245.0, + "step": 2805 + }, + { + "epoch": 0.35695204172497136, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.632383346557617, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8524476289749146, + "num_tokens": 107133627.0, + "step": 2806 + }, + { + "epoch": 0.3570792520035619, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.56670379638672, + "learning_rate": 1e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8277890086174011, + "num_tokens": 107170128.0, + "step": 2807 + }, + { + "epoch": 0.3572064622821524, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.44270896911621, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8621479272842407, + "num_tokens": 107209137.0, + "step": 2808 + }, + { + "epoch": 0.3573336725607429, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.61045265197754, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8469415903091431, + "num_tokens": 107244866.0, + "step": 2809 + }, + { + "epoch": 0.3574608828393334, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.50897216796875, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8515225648880005, + "num_tokens": 107287399.0, + "step": 2810 + }, + { + "epoch": 0.35758809311792394, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.573867797851562, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8410508632659912, + "num_tokens": 107325627.0, + "step": 2811 + }, + { + "epoch": 0.3577153033965144, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.525991439819336, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8672200441360474, + "num_tokens": 107359375.0, + "step": 2812 + }, + { + "epoch": 0.35784251367510495, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.569244384765625, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8398022651672363, + "num_tokens": 107400286.0, + "step": 2813 + }, + { + "epoch": 0.3579697239536955, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.475345611572266, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8425443768501282, + "num_tokens": 107431768.0, + "step": 2814 + }, + { + "epoch": 0.35809693423228595, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69706916809082, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8613187670707703, + "num_tokens": 107470875.0, + "step": 2815 + }, + { + "epoch": 0.3582241445108765, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57424545288086, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8432217836380005, + "num_tokens": 107510988.0, + "step": 2816 + }, + { + "epoch": 0.358351354789467, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.7937068939209, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8584966659545898, + "num_tokens": 107552423.0, + "step": 2817 + }, + { + "epoch": 0.3584785650680575, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.670013427734375, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8439528942108154, + "num_tokens": 107591693.0, + "step": 2818 + }, + { + "epoch": 0.358605775346648, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.726242065429688, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8567543029785156, + "num_tokens": 107626017.0, + "step": 2819 + }, + { + "epoch": 0.35873298562523853, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.580535888671875, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8684700727462769, + "num_tokens": 107664927.0, + "step": 2820 + }, + { + "epoch": 0.358860195903829, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.70634651184082, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.844244122505188, + "num_tokens": 107702487.0, + "step": 2821 + }, + { + "epoch": 0.35898740618241953, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.645374298095703, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8646132349967957, + "num_tokens": 107741263.0, + "step": 2822 + }, + { + "epoch": 0.35911461646101006, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.596433639526367, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8544333577156067, + "num_tokens": 107781079.0, + "step": 2823 + }, + { + "epoch": 0.35924182673960053, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.53183364868164, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8706231117248535, + "num_tokens": 107819309.0, + "step": 2824 + }, + { + "epoch": 0.35936903701819106, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.560747146606445, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8537129163742065, + "num_tokens": 107853251.0, + "step": 2825 + }, + { + "epoch": 0.3594962472967816, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.579797744750977, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8716130256652832, + "num_tokens": 107893476.0, + "step": 2826 + }, + { + "epoch": 0.35962345757537206, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.655797958374023, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8606098890304565, + "num_tokens": 107931204.0, + "step": 2827 + }, + { + "epoch": 0.3597506678539626, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.4051513671875, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8338444232940674, + "num_tokens": 107966471.0, + "step": 2828 + }, + { + "epoch": 0.3598778781325531, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.478506088256836, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8533269166946411, + "num_tokens": 108005328.0, + "step": 2829 + }, + { + "epoch": 0.36000508841114365, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.589889526367188, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8407353162765503, + "num_tokens": 108046295.0, + "step": 2830 + }, + { + "epoch": 0.3601322986897341, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.43276596069336, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8539502620697021, + "num_tokens": 108078164.0, + "step": 2831 + }, + { + "epoch": 0.36025950896832465, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.661733627319336, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8470747470855713, + "num_tokens": 108117105.0, + "step": 2832 + }, + { + "epoch": 0.3603867192469152, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.520292282104492, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8491402864456177, + "num_tokens": 108155301.0, + "step": 2833 + }, + { + "epoch": 0.36051392952550565, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.757966995239258, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8627758026123047, + "num_tokens": 108193091.0, + "step": 2834 + }, + { + "epoch": 0.3606411398040962, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.469314575195312, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8498530983924866, + "num_tokens": 108233719.0, + "step": 2835 + }, + { + "epoch": 0.3607683500826867, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58031463623047, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8578915596008301, + "num_tokens": 108265775.0, + "step": 2836 + }, + { + "epoch": 0.3608955603612772, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.508920669555664, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8623009920120239, + "num_tokens": 108307953.0, + "step": 2837 + }, + { + "epoch": 0.3610227706398677, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.646684646606445, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8478720188140869, + "num_tokens": 108351998.0, + "step": 2838 + }, + { + "epoch": 0.36114998091845824, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.562257766723633, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8400510549545288, + "num_tokens": 108395968.0, + "step": 2839 + }, + { + "epoch": 0.3612771911970487, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.710363388061523, + "learning_rate": 1e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.833532452583313, + "num_tokens": 108433045.0, + "step": 2840 + }, + { + "epoch": 0.36140440147563924, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.59134292602539, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8476186990737915, + "num_tokens": 108470964.0, + "step": 2841 + }, + { + "epoch": 0.36153161175422976, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.685441970825195, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8522359728813171, + "num_tokens": 108512572.0, + "step": 2842 + }, + { + "epoch": 0.36165882203282024, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.615192413330078, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8369115591049194, + "num_tokens": 108548526.0, + "step": 2843 + }, + { + "epoch": 0.36178603231141077, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.688207626342773, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8499146699905396, + "num_tokens": 108588447.0, + "step": 2844 + }, + { + "epoch": 0.3619132425900013, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.6353702545166, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8418797254562378, + "num_tokens": 108628426.0, + "step": 2845 + }, + { + "epoch": 0.36204045286859177, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.76171112060547, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8464982509613037, + "num_tokens": 108661355.0, + "step": 2846 + }, + { + "epoch": 0.3621676631471823, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.687135696411133, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8672876358032227, + "num_tokens": 108698505.0, + "step": 2847 + }, + { + "epoch": 0.3622948734257728, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.564178466796875, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8566423058509827, + "num_tokens": 108737409.0, + "step": 2848 + }, + { + "epoch": 0.3624220837043633, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.56092643737793, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8525230288505554, + "num_tokens": 108772969.0, + "step": 2849 + }, + { + "epoch": 0.3625492939829538, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.651336669921875, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8588991165161133, + "num_tokens": 108805129.0, + "step": 2850 + }, + { + "epoch": 0.36267650426154435, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.62612533569336, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8376689553260803, + "num_tokens": 108841197.0, + "step": 2851 + }, + { + "epoch": 0.3628037145401348, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.496944427490234, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8652839660644531, + "num_tokens": 108876442.0, + "step": 2852 + }, + { + "epoch": 0.36293092481872535, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.77480125427246, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8371083736419678, + "num_tokens": 108911432.0, + "step": 2853 + }, + { + "epoch": 0.3630581350973159, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.663217544555664, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8366036415100098, + "num_tokens": 108952977.0, + "step": 2854 + }, + { + "epoch": 0.36318534537590635, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.663551330566406, + "learning_rate": 1e-06, + "loss": 0.551, + "mean_token_accuracy": 0.8289083242416382, + "num_tokens": 108995602.0, + "step": 2855 + }, + { + "epoch": 0.3633125556544969, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.510488510131836, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8537572026252747, + "num_tokens": 109040226.0, + "step": 2856 + }, + { + "epoch": 0.3634397659330874, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.595962524414062, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.853550910949707, + "num_tokens": 109082898.0, + "step": 2857 + }, + { + "epoch": 0.3635669762116779, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.590478897094727, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8423023819923401, + "num_tokens": 109122460.0, + "step": 2858 + }, + { + "epoch": 0.3636941864902684, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.651954650878906, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8511759042739868, + "num_tokens": 109161680.0, + "step": 2859 + }, + { + "epoch": 0.36382139676885894, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.77409553527832, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8703916072845459, + "num_tokens": 109198943.0, + "step": 2860 + }, + { + "epoch": 0.3639486070474494, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.56907844543457, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8599457740783691, + "num_tokens": 109235719.0, + "step": 2861 + }, + { + "epoch": 0.36407581732603994, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69721031188965, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8445395827293396, + "num_tokens": 109272681.0, + "step": 2862 + }, + { + "epoch": 0.36420302760463047, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.715591430664062, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8402674198150635, + "num_tokens": 109306114.0, + "step": 2863 + }, + { + "epoch": 0.36433023788322094, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.84625244140625, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8429034948348999, + "num_tokens": 109337728.0, + "step": 2864 + }, + { + "epoch": 0.36445744816181147, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.757606506347656, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.841861367225647, + "num_tokens": 109377809.0, + "step": 2865 + }, + { + "epoch": 0.364584658440402, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.522560119628906, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8655219078063965, + "num_tokens": 109413499.0, + "step": 2866 + }, + { + "epoch": 0.36471186871899247, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.768522262573242, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8418647050857544, + "num_tokens": 109460961.0, + "step": 2867 + }, + { + "epoch": 0.364839078997583, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.741275787353516, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8623486757278442, + "num_tokens": 109500615.0, + "step": 2868 + }, + { + "epoch": 0.36496628927617353, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.9830265045166, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8546745777130127, + "num_tokens": 109533921.0, + "step": 2869 + }, + { + "epoch": 0.365093499554764, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.738563537597656, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8418362736701965, + "num_tokens": 109567069.0, + "step": 2870 + }, + { + "epoch": 0.36522070983335453, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.541885375976562, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8650132417678833, + "num_tokens": 109603696.0, + "step": 2871 + }, + { + "epoch": 0.36534792011194506, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.756793975830078, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8695317506790161, + "num_tokens": 109644590.0, + "step": 2872 + }, + { + "epoch": 0.36547513039053553, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.652183532714844, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8471460342407227, + "num_tokens": 109678869.0, + "step": 2873 + }, + { + "epoch": 0.36560234066912606, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.776405334472656, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8456617593765259, + "num_tokens": 109722517.0, + "step": 2874 + }, + { + "epoch": 0.3657295509477166, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.666227340698242, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.848664402961731, + "num_tokens": 109757869.0, + "step": 2875 + }, + { + "epoch": 0.36585676122630706, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.833358764648438, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8470316529273987, + "num_tokens": 109793162.0, + "step": 2876 + }, + { + "epoch": 0.3659839715048976, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.643203735351562, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8581092953681946, + "num_tokens": 109832942.0, + "step": 2877 + }, + { + "epoch": 0.3661111817834881, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.726659774780273, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.851111650466919, + "num_tokens": 109876230.0, + "step": 2878 + }, + { + "epoch": 0.3662383920620786, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.733423233032227, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8421345949172974, + "num_tokens": 109911844.0, + "step": 2879 + }, + { + "epoch": 0.3663656023406691, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.638336181640625, + "learning_rate": 1e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.839766263961792, + "num_tokens": 109957619.0, + "step": 2880 + }, + { + "epoch": 0.36649281261925964, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.828842163085938, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8412669897079468, + "num_tokens": 109999117.0, + "step": 2881 + }, + { + "epoch": 0.3666200228978502, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.715084075927734, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8585817813873291, + "num_tokens": 110039557.0, + "step": 2882 + }, + { + "epoch": 0.36674723317644065, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.76361846923828, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8433730602264404, + "num_tokens": 110074418.0, + "step": 2883 + }, + { + "epoch": 0.3668744434550312, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.704978942871094, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8571703433990479, + "num_tokens": 110109255.0, + "step": 2884 + }, + { + "epoch": 0.3670016537336217, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.638450622558594, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8573954105377197, + "num_tokens": 110150180.0, + "step": 2885 + }, + { + "epoch": 0.3671288640122122, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.713472366333008, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8353373408317566, + "num_tokens": 110193653.0, + "step": 2886 + }, + { + "epoch": 0.3672560742908027, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.760770797729492, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8581434488296509, + "num_tokens": 110231020.0, + "step": 2887 + }, + { + "epoch": 0.36738328456939323, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.767139434814453, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8667504191398621, + "num_tokens": 110269397.0, + "step": 2888 + }, + { + "epoch": 0.3675104948479837, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.772838592529297, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8670251369476318, + "num_tokens": 110311188.0, + "step": 2889 + }, + { + "epoch": 0.36763770512657423, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.71323013305664, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8552981019020081, + "num_tokens": 110343801.0, + "step": 2890 + }, + { + "epoch": 0.36776491540516476, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.7055606842041, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8346769213676453, + "num_tokens": 110385830.0, + "step": 2891 + }, + { + "epoch": 0.36789212568375523, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.73453140258789, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8503198623657227, + "num_tokens": 110421494.0, + "step": 2892 + }, + { + "epoch": 0.36801933596234576, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.726905822753906, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8508782386779785, + "num_tokens": 110463132.0, + "step": 2893 + }, + { + "epoch": 0.3681465462409363, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.81130027770996, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8542405366897583, + "num_tokens": 110498330.0, + "step": 2894 + }, + { + "epoch": 0.36827375651952676, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.777002334594727, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8606316447257996, + "num_tokens": 110534438.0, + "step": 2895 + }, + { + "epoch": 0.3684009667981173, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.756372451782227, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8474414348602295, + "num_tokens": 110572059.0, + "step": 2896 + }, + { + "epoch": 0.3685281770767078, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.849733352661133, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8533986806869507, + "num_tokens": 110611012.0, + "step": 2897 + }, + { + "epoch": 0.3686553873552983, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.74375343322754, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8594242930412292, + "num_tokens": 110647429.0, + "step": 2898 + }, + { + "epoch": 0.3687825976338888, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.734783172607422, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8458826541900635, + "num_tokens": 110685052.0, + "step": 2899 + }, + { + "epoch": 0.36890980791247935, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.993709564208984, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8306897878646851, + "num_tokens": 110725222.0, + "step": 2900 + }, + { + "epoch": 0.3690370181910698, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.390094757080078, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8413901925086975, + "num_tokens": 110764314.0, + "step": 2901 + }, + { + "epoch": 0.36916422846966035, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.945512771606445, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8633172512054443, + "num_tokens": 110803585.0, + "step": 2902 + }, + { + "epoch": 0.3692914387482509, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.865434646606445, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8466823101043701, + "num_tokens": 110846532.0, + "step": 2903 + }, + { + "epoch": 0.36941864902684135, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.915851593017578, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8491215109825134, + "num_tokens": 110889807.0, + "step": 2904 + }, + { + "epoch": 0.3695458593054319, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.417579650878906, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8322272300720215, + "num_tokens": 110930567.0, + "step": 2905 + }, + { + "epoch": 0.3696730695840224, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.988502502441406, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8572100400924683, + "num_tokens": 110968725.0, + "step": 2906 + }, + { + "epoch": 0.3698002798626129, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.911806106567383, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8754636645317078, + "num_tokens": 111002724.0, + "step": 2907 + }, + { + "epoch": 0.3699274901412034, + "ewc_loss": 0.0196533203125, + "ewc_loss_parallel": 1.9669532775878906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.646833419799805, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8500176668167114, + "num_tokens": 111036668.0, + "step": 2908 + }, + { + "epoch": 0.37005470041979394, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.718109130859375, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8436674475669861, + "num_tokens": 111077712.0, + "step": 2909 + }, + { + "epoch": 0.3701819106983844, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.109848022460938, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8416931629180908, + "num_tokens": 111118856.0, + "step": 2910 + }, + { + "epoch": 0.37030912097697494, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.80350112915039, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8430931568145752, + "num_tokens": 111156712.0, + "step": 2911 + }, + { + "epoch": 0.37043633125556547, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.763338088989258, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8526373505592346, + "num_tokens": 111195542.0, + "step": 2912 + }, + { + "epoch": 0.37056354153415594, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69944190979004, + "learning_rate": 1e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8338613510131836, + "num_tokens": 111237827.0, + "step": 2913 + }, + { + "epoch": 0.37069075181274647, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.091571807861328, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8600316047668457, + "num_tokens": 111278802.0, + "step": 2914 + }, + { + "epoch": 0.370817962091337, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.736848831176758, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8496191501617432, + "num_tokens": 111313795.0, + "step": 2915 + }, + { + "epoch": 0.37094517236992747, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.82546615600586, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8580838441848755, + "num_tokens": 111353798.0, + "step": 2916 + }, + { + "epoch": 0.371072382648518, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.90308380126953, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8523815870285034, + "num_tokens": 111390613.0, + "step": 2917 + }, + { + "epoch": 0.3711995929271085, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.584421157836914, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8524988293647766, + "num_tokens": 111432043.0, + "step": 2918 + }, + { + "epoch": 0.371326803205699, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.641315460205078, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8579258322715759, + "num_tokens": 111476575.0, + "step": 2919 + }, + { + "epoch": 0.3714540134842895, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.734939575195312, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.851033627986908, + "num_tokens": 111515304.0, + "step": 2920 + }, + { + "epoch": 0.37158122376288005, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.838943481445312, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.860837459564209, + "num_tokens": 111550246.0, + "step": 2921 + }, + { + "epoch": 0.3717084340414705, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.66950035095215, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8524408936500549, + "num_tokens": 111592189.0, + "step": 2922 + }, + { + "epoch": 0.37183564432006105, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.639097213745117, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8550835847854614, + "num_tokens": 111638177.0, + "step": 2923 + }, + { + "epoch": 0.3719628545986516, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.872852325439453, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8532922267913818, + "num_tokens": 111674365.0, + "step": 2924 + }, + { + "epoch": 0.37209006487724205, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.793739318847656, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8467724323272705, + "num_tokens": 111714571.0, + "step": 2925 + }, + { + "epoch": 0.3722172751558326, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.582645416259766, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.85174560546875, + "num_tokens": 111755214.0, + "step": 2926 + }, + { + "epoch": 0.3723444854344231, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.938608169555664, + "learning_rate": 1e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.8250695466995239, + "num_tokens": 111796516.0, + "step": 2927 + }, + { + "epoch": 0.3724716957130136, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.70391273498535, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8375643491744995, + "num_tokens": 111832615.0, + "step": 2928 + }, + { + "epoch": 0.3725989059916041, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.829483032226562, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8422656655311584, + "num_tokens": 111874682.0, + "step": 2929 + }, + { + "epoch": 0.37272611627019464, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.741586685180664, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8395093679428101, + "num_tokens": 111916369.0, + "step": 2930 + }, + { + "epoch": 0.37285332654878517, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69034767150879, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8429276943206787, + "num_tokens": 111950671.0, + "step": 2931 + }, + { + "epoch": 0.37298053682737564, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.917055130004883, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8337575197219849, + "num_tokens": 111988846.0, + "step": 2932 + }, + { + "epoch": 0.37310774710596617, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.692882537841797, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8607848882675171, + "num_tokens": 112022853.0, + "step": 2933 + }, + { + "epoch": 0.3732349573845567, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.743989944458008, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8475587368011475, + "num_tokens": 112065631.0, + "step": 2934 + }, + { + "epoch": 0.37336216766314717, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.597734451293945, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8403440117835999, + "num_tokens": 112107236.0, + "step": 2935 + }, + { + "epoch": 0.3734893779417377, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.845123291015625, + "learning_rate": 1e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8290088176727295, + "num_tokens": 112153914.0, + "step": 2936 + }, + { + "epoch": 0.3736165882203282, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.704814910888672, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8531254529953003, + "num_tokens": 112187596.0, + "step": 2937 + }, + { + "epoch": 0.3737437984989187, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.679790496826172, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8486515283584595, + "num_tokens": 112222409.0, + "step": 2938 + }, + { + "epoch": 0.37387100877750923, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.65867042541504, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8386890888214111, + "num_tokens": 112257699.0, + "step": 2939 + }, + { + "epoch": 0.37399821905609976, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.63378143310547, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8592847585678101, + "num_tokens": 112292431.0, + "step": 2940 + }, + { + "epoch": 0.37412542933469023, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.702320098876953, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.852175235748291, + "num_tokens": 112331241.0, + "step": 2941 + }, + { + "epoch": 0.37425263961328076, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.68045997619629, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8476166725158691, + "num_tokens": 112368486.0, + "step": 2942 + }, + { + "epoch": 0.3743798498918713, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.662723541259766, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8418329954147339, + "num_tokens": 112409638.0, + "step": 2943 + }, + { + "epoch": 0.37450706017046176, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.612627029418945, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8463459014892578, + "num_tokens": 112448257.0, + "step": 2944 + }, + { + "epoch": 0.3746342704490523, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.768564224243164, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8661769032478333, + "num_tokens": 112488739.0, + "step": 2945 + }, + { + "epoch": 0.3747614807276428, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.840953826904297, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8363739252090454, + "num_tokens": 112528621.0, + "step": 2946 + }, + { + "epoch": 0.3748886910062333, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.647348403930664, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8457933664321899, + "num_tokens": 112564615.0, + "step": 2947 + }, + { + "epoch": 0.3750159012848238, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.677396774291992, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8397691249847412, + "num_tokens": 112598190.0, + "step": 2948 + }, + { + "epoch": 0.37514311156341434, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.873994827270508, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8501206040382385, + "num_tokens": 112632078.0, + "step": 2949 + }, + { + "epoch": 0.3752703218420048, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.766939163208008, + "learning_rate": 1e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8285772204399109, + "num_tokens": 112667848.0, + "step": 2950 + }, + { + "epoch": 0.37539753212059535, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.707271575927734, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8489099144935608, + "num_tokens": 112707037.0, + "step": 2951 + }, + { + "epoch": 0.3755247423991859, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.634777069091797, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8600456714630127, + "num_tokens": 112751921.0, + "step": 2952 + }, + { + "epoch": 0.37565195267777635, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.46693229675293, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8546996116638184, + "num_tokens": 112793061.0, + "step": 2953 + }, + { + "epoch": 0.3757791629563669, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.795818328857422, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8603993654251099, + "num_tokens": 112833852.0, + "step": 2954 + }, + { + "epoch": 0.3759063732349574, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.68792724609375, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8468770980834961, + "num_tokens": 112866718.0, + "step": 2955 + }, + { + "epoch": 0.3760335835135479, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.710405349731445, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8517439961433411, + "num_tokens": 112909420.0, + "step": 2956 + }, + { + "epoch": 0.3761607937921384, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.839204788208008, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.853915810585022, + "num_tokens": 112944570.0, + "step": 2957 + }, + { + "epoch": 0.37628800407072893, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.72557258605957, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8639988899230957, + "num_tokens": 112984107.0, + "step": 2958 + }, + { + "epoch": 0.3764152143493194, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.779088973999023, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8733013868331909, + "num_tokens": 113023066.0, + "step": 2959 + }, + { + "epoch": 0.37654242462790993, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.829545974731445, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8484482169151306, + "num_tokens": 113060093.0, + "step": 2960 + }, + { + "epoch": 0.37666963490650046, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.63898277282715, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8437408208847046, + "num_tokens": 113095940.0, + "step": 2961 + }, + { + "epoch": 0.37679684518509093, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.965679168701172, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.861992359161377, + "num_tokens": 113130699.0, + "step": 2962 + }, + { + "epoch": 0.37692405546368146, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.58173370361328, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8515470027923584, + "num_tokens": 113169145.0, + "step": 2963 + }, + { + "epoch": 0.377051265742272, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.022188186645508, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8657699823379517, + "num_tokens": 113201987.0, + "step": 2964 + }, + { + "epoch": 0.37717847602086246, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.8311767578125, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8562041521072388, + "num_tokens": 113239588.0, + "step": 2965 + }, + { + "epoch": 0.377305686299453, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.75001335144043, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8526596426963806, + "num_tokens": 113277075.0, + "step": 2966 + }, + { + "epoch": 0.3774328965780435, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.781394958496094, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8785895109176636, + "num_tokens": 113313708.0, + "step": 2967 + }, + { + "epoch": 0.377560106856634, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.891616821289062, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8322168588638306, + "num_tokens": 113344027.0, + "step": 2968 + }, + { + "epoch": 0.3776873171352245, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57767677307129, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8557324409484863, + "num_tokens": 113388444.0, + "step": 2969 + }, + { + "epoch": 0.37781452741381505, + "ewc_loss": 0.019775390625, + "ewc_loss_parallel": 1.9788742065429688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.731733322143555, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8466629981994629, + "num_tokens": 113428660.0, + "step": 2970 + }, + { + "epoch": 0.3779417376924055, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.967374801635742, + "learning_rate": 1e-06, + "loss": 0.5348, + "mean_token_accuracy": 0.8300659656524658, + "num_tokens": 113469160.0, + "step": 2971 + }, + { + "epoch": 0.37806894797099605, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.80889892578125, + "learning_rate": 1e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.8268595337867737, + "num_tokens": 113504291.0, + "step": 2972 + }, + { + "epoch": 0.3781961582495866, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.636646270751953, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8530130386352539, + "num_tokens": 113548085.0, + "step": 2973 + }, + { + "epoch": 0.37832336852817705, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.887977600097656, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.855352520942688, + "num_tokens": 113581664.0, + "step": 2974 + }, + { + "epoch": 0.3784505788067676, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.623104095458984, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8646544814109802, + "num_tokens": 113618552.0, + "step": 2975 + }, + { + "epoch": 0.3785777890853581, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.764957427978516, + "learning_rate": 1e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8310549259185791, + "num_tokens": 113653406.0, + "step": 2976 + }, + { + "epoch": 0.3787049993639486, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.720308303833008, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8591729402542114, + "num_tokens": 113693803.0, + "step": 2977 + }, + { + "epoch": 0.3788322096425391, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.710908889770508, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.866378903388977, + "num_tokens": 113734637.0, + "step": 2978 + }, + { + "epoch": 0.37895941992112964, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.600927352905273, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8432661890983582, + "num_tokens": 113776579.0, + "step": 2979 + }, + { + "epoch": 0.37908663019972016, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.921894073486328, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8407402038574219, + "num_tokens": 113818857.0, + "step": 2980 + }, + { + "epoch": 0.37921384047831064, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.75647735595703, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8426677584648132, + "num_tokens": 113853248.0, + "step": 2981 + }, + { + "epoch": 0.37934105075690117, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.642311096191406, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8581264019012451, + "num_tokens": 113894206.0, + "step": 2982 + }, + { + "epoch": 0.3794682610354917, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.634695053100586, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8591800928115845, + "num_tokens": 113931309.0, + "step": 2983 + }, + { + "epoch": 0.37959547131408217, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.754657745361328, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8417927622795105, + "num_tokens": 113976559.0, + "step": 2984 + }, + { + "epoch": 0.3797226815926727, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.7662353515625, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.850013017654419, + "num_tokens": 114010814.0, + "step": 2985 + }, + { + "epoch": 0.3798498918712632, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.690420150756836, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8697381019592285, + "num_tokens": 114049389.0, + "step": 2986 + }, + { + "epoch": 0.3799771021498537, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.787845611572266, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8693050742149353, + "num_tokens": 114081463.0, + "step": 2987 + }, + { + "epoch": 0.3801043124284442, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69768524169922, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8673802614212036, + "num_tokens": 114117175.0, + "step": 2988 + }, + { + "epoch": 0.38023152270703475, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.591184616088867, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8404921293258667, + "num_tokens": 114153764.0, + "step": 2989 + }, + { + "epoch": 0.3803587329856252, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.82139778137207, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8511802554130554, + "num_tokens": 114191487.0, + "step": 2990 + }, + { + "epoch": 0.38048594326421575, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.587984085083008, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.826832115650177, + "num_tokens": 114233880.0, + "step": 2991 + }, + { + "epoch": 0.3806131535428063, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.797954559326172, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8494582176208496, + "num_tokens": 114269238.0, + "step": 2992 + }, + { + "epoch": 0.38074036382139675, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.652585983276367, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8635250329971313, + "num_tokens": 114307645.0, + "step": 2993 + }, + { + "epoch": 0.3808675740999873, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.77365493774414, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8553894758224487, + "num_tokens": 114341800.0, + "step": 2994 + }, + { + "epoch": 0.3809947843785778, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.718034744262695, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8598388433456421, + "num_tokens": 114378363.0, + "step": 2995 + }, + { + "epoch": 0.3811219946571683, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.625877380371094, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8401429653167725, + "num_tokens": 114414489.0, + "step": 2996 + }, + { + "epoch": 0.3812492049357588, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.733781814575195, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8467333316802979, + "num_tokens": 114449197.0, + "step": 2997 + }, + { + "epoch": 0.38137641521434934, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.740259170532227, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8485406637191772, + "num_tokens": 114478654.0, + "step": 2998 + }, + { + "epoch": 0.3815036254929398, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.719205856323242, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8597457408905029, + "num_tokens": 114521807.0, + "step": 2999 + }, + { + "epoch": 0.38163083577153034, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.79665184020996, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.865357518196106, + "num_tokens": 114558297.0, + "step": 3000 + }, + { + "epoch": 0.38175804605012087, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.813873291015625, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8573046922683716, + "num_tokens": 114599630.0, + "step": 3001 + }, + { + "epoch": 0.38188525632871134, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.908206939697266, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.849949061870575, + "num_tokens": 114638697.0, + "step": 3002 + }, + { + "epoch": 0.38201246660730187, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.929643630981445, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8521426320075989, + "num_tokens": 114676106.0, + "step": 3003 + }, + { + "epoch": 0.3821396768858924, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.687395095825195, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8495907783508301, + "num_tokens": 114712026.0, + "step": 3004 + }, + { + "epoch": 0.38226688716448287, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.817529678344727, + "learning_rate": 1e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8292585611343384, + "num_tokens": 114752348.0, + "step": 3005 + }, + { + "epoch": 0.3823940974430734, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.682767868041992, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8554982542991638, + "num_tokens": 114789922.0, + "step": 3006 + }, + { + "epoch": 0.38252130772166393, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.73287582397461, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8463528752326965, + "num_tokens": 114838834.0, + "step": 3007 + }, + { + "epoch": 0.3826485180002544, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.800127029418945, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8312655687332153, + "num_tokens": 114873320.0, + "step": 3008 + }, + { + "epoch": 0.38277572827884493, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.730697631835938, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8637284636497498, + "num_tokens": 114908293.0, + "step": 3009 + }, + { + "epoch": 0.38290293855743546, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.833053588867188, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8415654897689819, + "num_tokens": 114950083.0, + "step": 3010 + }, + { + "epoch": 0.38303014883602593, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.79328727722168, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8562837839126587, + "num_tokens": 114982353.0, + "step": 3011 + }, + { + "epoch": 0.38315735911461646, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.675777435302734, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8564531803131104, + "num_tokens": 115017668.0, + "step": 3012 + }, + { + "epoch": 0.383284569393207, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.89628791809082, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8630681037902832, + "num_tokens": 115055266.0, + "step": 3013 + }, + { + "epoch": 0.38341177967179746, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.63582992553711, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8467222452163696, + "num_tokens": 115089728.0, + "step": 3014 + }, + { + "epoch": 0.383538989950388, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.877843856811523, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8364447355270386, + "num_tokens": 115135192.0, + "step": 3015 + }, + { + "epoch": 0.3836662002289785, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.83122444152832, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8345116376876831, + "num_tokens": 115170350.0, + "step": 3016 + }, + { + "epoch": 0.383793410507569, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.639022827148438, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8586267232894897, + "num_tokens": 115202724.0, + "step": 3017 + }, + { + "epoch": 0.3839206207861595, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.893932342529297, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8512979745864868, + "num_tokens": 115239274.0, + "step": 3018 + }, + { + "epoch": 0.38404783106475004, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.748687744140625, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8600577116012573, + "num_tokens": 115278330.0, + "step": 3019 + }, + { + "epoch": 0.3841750413433405, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.728160858154297, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8474115133285522, + "num_tokens": 115321258.0, + "step": 3020 + }, + { + "epoch": 0.38430225162193105, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.78609848022461, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8529805541038513, + "num_tokens": 115362609.0, + "step": 3021 + }, + { + "epoch": 0.3844294619005216, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.83881950378418, + "learning_rate": 1e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8251326084136963, + "num_tokens": 115405814.0, + "step": 3022 + }, + { + "epoch": 0.38455667217911205, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.820810317993164, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8449518084526062, + "num_tokens": 115440856.0, + "step": 3023 + }, + { + "epoch": 0.3846838824577026, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.087291717529297, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8575656414031982, + "num_tokens": 115482582.0, + "step": 3024 + }, + { + "epoch": 0.3848110927362931, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.96456527709961, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8423508405685425, + "num_tokens": 115527268.0, + "step": 3025 + }, + { + "epoch": 0.3849383030148836, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.85603141784668, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8462209701538086, + "num_tokens": 115563757.0, + "step": 3026 + }, + { + "epoch": 0.3850655132934741, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.723514556884766, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8549717664718628, + "num_tokens": 115599629.0, + "step": 3027 + }, + { + "epoch": 0.38519272357206463, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.076717376708984, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8547754287719727, + "num_tokens": 115634261.0, + "step": 3028 + }, + { + "epoch": 0.3853199338506551, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.746444702148438, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8643175363540649, + "num_tokens": 115671150.0, + "step": 3029 + }, + { + "epoch": 0.38544714412924563, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.670869827270508, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8651140928268433, + "num_tokens": 115714328.0, + "step": 3030 + }, + { + "epoch": 0.38557435440783616, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.93979835510254, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8667706847190857, + "num_tokens": 115747118.0, + "step": 3031 + }, + { + "epoch": 0.3857015646864267, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.71854591369629, + "learning_rate": 1e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8314468860626221, + "num_tokens": 115790309.0, + "step": 3032 + }, + { + "epoch": 0.38582877496501716, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.708526611328125, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8642392754554749, + "num_tokens": 115825630.0, + "step": 3033 + }, + { + "epoch": 0.3859559852436077, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.963481903076172, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8588896989822388, + "num_tokens": 115858848.0, + "step": 3034 + }, + { + "epoch": 0.3860831955221982, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57977294921875, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.860638439655304, + "num_tokens": 115900024.0, + "step": 3035 + }, + { + "epoch": 0.3862104058007887, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.78753089904785, + "learning_rate": 1e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8268803358078003, + "num_tokens": 115938899.0, + "step": 3036 + }, + { + "epoch": 0.3863376160793792, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.737342834472656, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8608123064041138, + "num_tokens": 115972199.0, + "step": 3037 + }, + { + "epoch": 0.38646482635796975, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.667383193969727, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8607024550437927, + "num_tokens": 116011506.0, + "step": 3038 + }, + { + "epoch": 0.3865920366365602, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.694364547729492, + "learning_rate": 1e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8301012516021729, + "num_tokens": 116051428.0, + "step": 3039 + }, + { + "epoch": 0.38671924691515075, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.831357955932617, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8599787950515747, + "num_tokens": 116090307.0, + "step": 3040 + }, + { + "epoch": 0.3868464571937413, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.806142807006836, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8422675728797913, + "num_tokens": 116123381.0, + "step": 3041 + }, + { + "epoch": 0.38697366747233175, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.879724502563477, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.854765772819519, + "num_tokens": 116157755.0, + "step": 3042 + }, + { + "epoch": 0.3871008777509223, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.682767868041992, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8525388240814209, + "num_tokens": 116197371.0, + "step": 3043 + }, + { + "epoch": 0.3872280880295128, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.820541381835938, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8369855880737305, + "num_tokens": 116240544.0, + "step": 3044 + }, + { + "epoch": 0.3873552983081033, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.79418182373047, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8492856025695801, + "num_tokens": 116272079.0, + "step": 3045 + }, + { + "epoch": 0.3874825085866938, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.719215393066406, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.849166750907898, + "num_tokens": 116309641.0, + "step": 3046 + }, + { + "epoch": 0.38760971886528434, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.85503387451172, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8466600775718689, + "num_tokens": 116344470.0, + "step": 3047 + }, + { + "epoch": 0.3877369291438748, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.846492767333984, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8553732633590698, + "num_tokens": 116380177.0, + "step": 3048 + }, + { + "epoch": 0.38786413942246534, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.759662628173828, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8622059226036072, + "num_tokens": 116416482.0, + "step": 3049 + }, + { + "epoch": 0.38799134970105587, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.829668045043945, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8402175307273865, + "num_tokens": 116456103.0, + "step": 3050 + }, + { + "epoch": 0.38811855997964634, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.682167053222656, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8575341701507568, + "num_tokens": 116496810.0, + "step": 3051 + }, + { + "epoch": 0.38824577025823687, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.881498336791992, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8479925394058228, + "num_tokens": 116532999.0, + "step": 3052 + }, + { + "epoch": 0.3883729805368274, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.688886642456055, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8596251606941223, + "num_tokens": 116565700.0, + "step": 3053 + }, + { + "epoch": 0.38850019081541787, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.894929885864258, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8553829193115234, + "num_tokens": 116602798.0, + "step": 3054 + }, + { + "epoch": 0.3886274010940084, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.714675903320312, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8626485466957092, + "num_tokens": 116638743.0, + "step": 3055 + }, + { + "epoch": 0.3887546113725989, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.713130950927734, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8637343645095825, + "num_tokens": 116672290.0, + "step": 3056 + }, + { + "epoch": 0.3888818216511894, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.73574447631836, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8540834188461304, + "num_tokens": 116712593.0, + "step": 3057 + }, + { + "epoch": 0.3890090319297799, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.7521915435791, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8504369258880615, + "num_tokens": 116748663.0, + "step": 3058 + }, + { + "epoch": 0.38913624220837045, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.927749633789062, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8487462997436523, + "num_tokens": 116796018.0, + "step": 3059 + }, + { + "epoch": 0.3892634524869609, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.97605323791504, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8495545983314514, + "num_tokens": 116837287.0, + "step": 3060 + }, + { + "epoch": 0.38939066276555145, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.869831085205078, + "learning_rate": 1e-06, + "loss": 0.5348, + "mean_token_accuracy": 0.8363479375839233, + "num_tokens": 116872163.0, + "step": 3061 + }, + { + "epoch": 0.389517873044142, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.343461990356445, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8595547676086426, + "num_tokens": 116913445.0, + "step": 3062 + }, + { + "epoch": 0.38964508332273246, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.82271385192871, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8420416116714478, + "num_tokens": 116954469.0, + "step": 3063 + }, + { + "epoch": 0.389772293601323, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.39569854736328, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8511068820953369, + "num_tokens": 116994339.0, + "step": 3064 + }, + { + "epoch": 0.3898995038799135, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.96974754333496, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8638893365859985, + "num_tokens": 117030867.0, + "step": 3065 + }, + { + "epoch": 0.390026714158504, + "ewc_loss": 0.0198974609375, + "ewc_loss_parallel": 1.990795135498047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.57126808166504, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8606786727905273, + "num_tokens": 117068309.0, + "step": 3066 + }, + { + "epoch": 0.3901539244370945, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.999902725219727, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8479982614517212, + "num_tokens": 117105278.0, + "step": 3067 + }, + { + "epoch": 0.39028113471568504, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.063077926635742, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8442646265029907, + "num_tokens": 117143096.0, + "step": 3068 + }, + { + "epoch": 0.3904083449942755, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.866140365600586, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8526435494422913, + "num_tokens": 117179638.0, + "step": 3069 + }, + { + "epoch": 0.39053555527286604, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.69019317626953, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8652232885360718, + "num_tokens": 117211560.0, + "step": 3070 + }, + { + "epoch": 0.39066276555145657, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.72687530517578, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8446038961410522, + "num_tokens": 117250622.0, + "step": 3071 + }, + { + "epoch": 0.39078997583004704, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.103124618530273, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8729227185249329, + "num_tokens": 117287600.0, + "step": 3072 + }, + { + "epoch": 0.39091718610863757, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.973628997802734, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8375734090805054, + "num_tokens": 117328188.0, + "step": 3073 + }, + { + "epoch": 0.3910443963872281, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.897424697875977, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8601548075675964, + "num_tokens": 117367282.0, + "step": 3074 + }, + { + "epoch": 0.39117160666581857, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.72223663330078, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8633046746253967, + "num_tokens": 117403376.0, + "step": 3075 + }, + { + "epoch": 0.3912988169444091, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.00760841369629, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8415562510490417, + "num_tokens": 117438661.0, + "step": 3076 + }, + { + "epoch": 0.39142602722299963, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.952152252197266, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8566327095031738, + "num_tokens": 117477351.0, + "step": 3077 + }, + { + "epoch": 0.3915532375015901, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.03025245666504, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8497452735900879, + "num_tokens": 117512773.0, + "step": 3078 + }, + { + "epoch": 0.39168044778018063, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.80888557434082, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.837040901184082, + "num_tokens": 117555370.0, + "step": 3079 + }, + { + "epoch": 0.39180765805877116, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.878753662109375, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8604793548583984, + "num_tokens": 117594208.0, + "step": 3080 + }, + { + "epoch": 0.3919348683373617, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.824888229370117, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8551979660987854, + "num_tokens": 117632715.0, + "step": 3081 + }, + { + "epoch": 0.39206207861595216, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.941804885864258, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8479940295219421, + "num_tokens": 117669221.0, + "step": 3082 + }, + { + "epoch": 0.3921892888945427, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.013668060302734, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8465741872787476, + "num_tokens": 117709559.0, + "step": 3083 + }, + { + "epoch": 0.3923164991731332, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.836450576782227, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8596405982971191, + "num_tokens": 117745240.0, + "step": 3084 + }, + { + "epoch": 0.3924437094517237, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.045167922973633, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8528575897216797, + "num_tokens": 117787897.0, + "step": 3085 + }, + { + "epoch": 0.3925709197303142, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.944183349609375, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8485240340232849, + "num_tokens": 117826862.0, + "step": 3086 + }, + { + "epoch": 0.39269813000890474, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.829307556152344, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.847793459892273, + "num_tokens": 117866932.0, + "step": 3087 + }, + { + "epoch": 0.3928253402874952, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.82669448852539, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8537986278533936, + "num_tokens": 117905538.0, + "step": 3088 + }, + { + "epoch": 0.39295255056608575, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.943952560424805, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8409154415130615, + "num_tokens": 117945123.0, + "step": 3089 + }, + { + "epoch": 0.3930797608446763, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.94484519958496, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8552840948104858, + "num_tokens": 117988743.0, + "step": 3090 + }, + { + "epoch": 0.39320697112326675, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.763748168945312, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8673546314239502, + "num_tokens": 118026078.0, + "step": 3091 + }, + { + "epoch": 0.3933341814018573, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.958343505859375, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8637985587120056, + "num_tokens": 118060105.0, + "step": 3092 + }, + { + "epoch": 0.3934613916804478, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.850889205932617, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8792047500610352, + "num_tokens": 118089677.0, + "step": 3093 + }, + { + "epoch": 0.3935886019590383, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.903751373291016, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.866341233253479, + "num_tokens": 118128449.0, + "step": 3094 + }, + { + "epoch": 0.3937158122376288, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.004863739013672, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8613707423210144, + "num_tokens": 118168411.0, + "step": 3095 + }, + { + "epoch": 0.39384302251621933, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.7382755279541, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.863620400428772, + "num_tokens": 118206097.0, + "step": 3096 + }, + { + "epoch": 0.3939702327948098, + "ewc_loss": 0.02001953125, + "ewc_loss_parallel": 2.002716064453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.916046142578125, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8655314445495605, + "num_tokens": 118244429.0, + "step": 3097 + }, + { + "epoch": 0.39409744307340033, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.975189208984375, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8588564395904541, + "num_tokens": 118279125.0, + "step": 3098 + }, + { + "epoch": 0.39422465335199086, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.87165069580078, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8501768112182617, + "num_tokens": 118313156.0, + "step": 3099 + }, + { + "epoch": 0.39435186363058133, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.045787811279297, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8380717635154724, + "num_tokens": 118349868.0, + "step": 3100 + }, + { + "epoch": 0.39447907390917186, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.9074649810791, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8436163663864136, + "num_tokens": 118394998.0, + "step": 3101 + }, + { + "epoch": 0.3946062841877624, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.849504470825195, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8634820580482483, + "num_tokens": 118436002.0, + "step": 3102 + }, + { + "epoch": 0.39473349446635286, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.924835205078125, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8586467504501343, + "num_tokens": 118475094.0, + "step": 3103 + }, + { + "epoch": 0.3948607047449434, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.908296585083008, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8506917953491211, + "num_tokens": 118514164.0, + "step": 3104 + }, + { + "epoch": 0.3949879150235339, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.84729766845703, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8487128019332886, + "num_tokens": 118554271.0, + "step": 3105 + }, + { + "epoch": 0.3951151253021244, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.825315475463867, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8537960052490234, + "num_tokens": 118597123.0, + "step": 3106 + }, + { + "epoch": 0.3952423355807149, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.033798217773438, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.847673773765564, + "num_tokens": 118636683.0, + "step": 3107 + }, + { + "epoch": 0.39536954585930545, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.98125648498535, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8555803894996643, + "num_tokens": 118665770.0, + "step": 3108 + }, + { + "epoch": 0.3954967561378959, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.83316421508789, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8641336560249329, + "num_tokens": 118701652.0, + "step": 3109 + }, + { + "epoch": 0.39562396641648645, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.850332260131836, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8599679470062256, + "num_tokens": 118741756.0, + "step": 3110 + }, + { + "epoch": 0.395751176695077, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.890283584594727, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8531109094619751, + "num_tokens": 118784320.0, + "step": 3111 + }, + { + "epoch": 0.39587838697366745, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.819971084594727, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8471319675445557, + "num_tokens": 118821456.0, + "step": 3112 + }, + { + "epoch": 0.396005597252258, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.921701431274414, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8512856960296631, + "num_tokens": 118861746.0, + "step": 3113 + }, + { + "epoch": 0.3961328075308485, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.744047164916992, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8583377003669739, + "num_tokens": 118900346.0, + "step": 3114 + }, + { + "epoch": 0.396260017809439, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.99765968322754, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8529843688011169, + "num_tokens": 118939707.0, + "step": 3115 + }, + { + "epoch": 0.3963872280880295, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.896291732788086, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8526455163955688, + "num_tokens": 118971154.0, + "step": 3116 + }, + { + "epoch": 0.39651443836662004, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.82415771484375, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.837579607963562, + "num_tokens": 119012344.0, + "step": 3117 + }, + { + "epoch": 0.3966416486452105, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.85747718811035, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.852203905582428, + "num_tokens": 119047677.0, + "step": 3118 + }, + { + "epoch": 0.39676885892380104, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.95890998840332, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8675909638404846, + "num_tokens": 119086357.0, + "step": 3119 + }, + { + "epoch": 0.39689606920239157, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.6136531829834, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8599509596824646, + "num_tokens": 119122854.0, + "step": 3120 + }, + { + "epoch": 0.39702327948098204, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.08639144897461, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8602913618087769, + "num_tokens": 119155449.0, + "step": 3121 + }, + { + "epoch": 0.39715048975957257, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.859790802001953, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8487197756767273, + "num_tokens": 119197287.0, + "step": 3122 + }, + { + "epoch": 0.3972777000381631, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.92749786376953, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8413037657737732, + "num_tokens": 119235749.0, + "step": 3123 + }, + { + "epoch": 0.39740491031675357, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.94161033630371, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8488467931747437, + "num_tokens": 119267282.0, + "step": 3124 + }, + { + "epoch": 0.3975321205953441, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.79817771911621, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.844768762588501, + "num_tokens": 119304387.0, + "step": 3125 + }, + { + "epoch": 0.3976593308739346, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.832950592041016, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8437137603759766, + "num_tokens": 119341470.0, + "step": 3126 + }, + { + "epoch": 0.3977865411525251, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.800230026245117, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8422027826309204, + "num_tokens": 119381067.0, + "step": 3127 + }, + { + "epoch": 0.3979137514311156, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.72092628479004, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8539901971817017, + "num_tokens": 119415680.0, + "step": 3128 + }, + { + "epoch": 0.39804096170970615, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.057231903076172, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8438581228256226, + "num_tokens": 119454300.0, + "step": 3129 + }, + { + "epoch": 0.3981681719882967, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.777801513671875, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8656396865844727, + "num_tokens": 119485599.0, + "step": 3130 + }, + { + "epoch": 0.39829538226688715, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.006778717041016, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8387042284011841, + "num_tokens": 119525820.0, + "step": 3131 + }, + { + "epoch": 0.3984225925454777, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.76608657836914, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8594068288803101, + "num_tokens": 119568216.0, + "step": 3132 + }, + { + "epoch": 0.3985498028240682, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.791175842285156, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8407415151596069, + "num_tokens": 119602914.0, + "step": 3133 + }, + { + "epoch": 0.3986770131026587, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.798198699951172, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8485148549079895, + "num_tokens": 119637024.0, + "step": 3134 + }, + { + "epoch": 0.3988042233812492, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.89073944091797, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8431345820426941, + "num_tokens": 119679265.0, + "step": 3135 + }, + { + "epoch": 0.39893143365983974, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.802160263061523, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8447284698486328, + "num_tokens": 119716254.0, + "step": 3136 + }, + { + "epoch": 0.3990586439384302, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.905961990356445, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8448752164840698, + "num_tokens": 119761325.0, + "step": 3137 + }, + { + "epoch": 0.39918585421702074, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.838388442993164, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8763952851295471, + "num_tokens": 119795468.0, + "step": 3138 + }, + { + "epoch": 0.39931306449561127, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.056705474853516, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8431159257888794, + "num_tokens": 119833804.0, + "step": 3139 + }, + { + "epoch": 0.39944027477420174, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.850723266601562, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8483206033706665, + "num_tokens": 119870852.0, + "step": 3140 + }, + { + "epoch": 0.39956748505279227, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.010074615478516, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8551222085952759, + "num_tokens": 119909258.0, + "step": 3141 + }, + { + "epoch": 0.3996946953313828, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.972646713256836, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8417267799377441, + "num_tokens": 119940701.0, + "step": 3142 + }, + { + "epoch": 0.39982190560997327, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.8060302734375, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8532285690307617, + "num_tokens": 119976780.0, + "step": 3143 + }, + { + "epoch": 0.3999491158885638, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.82459831237793, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8444545865058899, + "num_tokens": 120017543.0, + "step": 3144 + }, + { + "epoch": 0.40007632616715433, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.962961196899414, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8493866920471191, + "num_tokens": 120050036.0, + "step": 3145 + }, + { + "epoch": 0.4002035364457448, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.810304641723633, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8671749830245972, + "num_tokens": 120084582.0, + "step": 3146 + }, + { + "epoch": 0.40033074672433533, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.994136810302734, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8485328555107117, + "num_tokens": 120120218.0, + "step": 3147 + }, + { + "epoch": 0.40045795700292586, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.009822845458984, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8739117383956909, + "num_tokens": 120161650.0, + "step": 3148 + }, + { + "epoch": 0.40058516728151633, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.842111587524414, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8612151145935059, + "num_tokens": 120197409.0, + "step": 3149 + }, + { + "epoch": 0.40071237756010686, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.11400032043457, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8407182693481445, + "num_tokens": 120240700.0, + "step": 3150 + }, + { + "epoch": 0.4008395878386974, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.216569900512695, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8427162170410156, + "num_tokens": 120286741.0, + "step": 3151 + }, + { + "epoch": 0.40096679811728786, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.827741622924805, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8556941747665405, + "num_tokens": 120323236.0, + "step": 3152 + }, + { + "epoch": 0.4010940083958784, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.743038177490234, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8512445688247681, + "num_tokens": 120362185.0, + "step": 3153 + }, + { + "epoch": 0.4012212186744689, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.999879837036133, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8436392545700073, + "num_tokens": 120397524.0, + "step": 3154 + }, + { + "epoch": 0.4013484289530594, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.905866622924805, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8564788699150085, + "num_tokens": 120439723.0, + "step": 3155 + }, + { + "epoch": 0.4014756392316499, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.816526412963867, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8572440147399902, + "num_tokens": 120483693.0, + "step": 3156 + }, + { + "epoch": 0.40160284951024044, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.037879943847656, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8616068363189697, + "num_tokens": 120520558.0, + "step": 3157 + }, + { + "epoch": 0.4017300597888309, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.834203720092773, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8621455430984497, + "num_tokens": 120557901.0, + "step": 3158 + }, + { + "epoch": 0.40185727006742145, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.02152442932129, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8587079048156738, + "num_tokens": 120593851.0, + "step": 3159 + }, + { + "epoch": 0.401984480346012, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.01693344116211, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8453598022460938, + "num_tokens": 120624533.0, + "step": 3160 + }, + { + "epoch": 0.40211169062460245, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.86116600036621, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8475795388221741, + "num_tokens": 120660820.0, + "step": 3161 + }, + { + "epoch": 0.402238900903193, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.886760711669922, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8640776872634888, + "num_tokens": 120701485.0, + "step": 3162 + }, + { + "epoch": 0.4023661111817835, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.827228546142578, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8618217706680298, + "num_tokens": 120736677.0, + "step": 3163 + }, + { + "epoch": 0.402493321460374, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.907108306884766, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8499463796615601, + "num_tokens": 120773933.0, + "step": 3164 + }, + { + "epoch": 0.4026205317389645, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.80975914001465, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8575021624565125, + "num_tokens": 120804671.0, + "step": 3165 + }, + { + "epoch": 0.40274774201755503, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.884708404541016, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8565692901611328, + "num_tokens": 120837042.0, + "step": 3166 + }, + { + "epoch": 0.4028749522961455, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.056745529174805, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8499794006347656, + "num_tokens": 120875127.0, + "step": 3167 + }, + { + "epoch": 0.40300216257473603, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.93524742126465, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8520166873931885, + "num_tokens": 120915937.0, + "step": 3168 + }, + { + "epoch": 0.40312937285332656, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.79389190673828, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8374063968658447, + "num_tokens": 120953334.0, + "step": 3169 + }, + { + "epoch": 0.40325658313191703, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.024269104003906, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8528051972389221, + "num_tokens": 120986674.0, + "step": 3170 + }, + { + "epoch": 0.40338379341050756, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.918392181396484, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8589411973953247, + "num_tokens": 121026305.0, + "step": 3171 + }, + { + "epoch": 0.4035110036890981, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.95763397216797, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8670836091041565, + "num_tokens": 121061644.0, + "step": 3172 + }, + { + "epoch": 0.40363821396768856, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.223800659179688, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8487840890884399, + "num_tokens": 121102860.0, + "step": 3173 + }, + { + "epoch": 0.4037654242462791, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.751270294189453, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8661642074584961, + "num_tokens": 121136566.0, + "step": 3174 + }, + { + "epoch": 0.4038926345248696, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.11298942565918, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8623960614204407, + "num_tokens": 121173759.0, + "step": 3175 + }, + { + "epoch": 0.4040198448034601, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.99677848815918, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.853670597076416, + "num_tokens": 121219881.0, + "step": 3176 + }, + { + "epoch": 0.4041470550820506, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.899539947509766, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8515315055847168, + "num_tokens": 121258719.0, + "step": 3177 + }, + { + "epoch": 0.40427426536064115, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.133195877075195, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8299448490142822, + "num_tokens": 121298013.0, + "step": 3178 + }, + { + "epoch": 0.4044014756392316, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.85620880126953, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8646761178970337, + "num_tokens": 121329615.0, + "step": 3179 + }, + { + "epoch": 0.40452868591782215, + "ewc_loss": 0.0201416015625, + "ewc_loss_parallel": 2.014636993408203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.066673278808594, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8627867698669434, + "num_tokens": 121365199.0, + "step": 3180 + }, + { + "epoch": 0.4046558961964127, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.09427833557129, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8480536937713623, + "num_tokens": 121402393.0, + "step": 3181 + }, + { + "epoch": 0.4047831064750032, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.94800567626953, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8525795936584473, + "num_tokens": 121439678.0, + "step": 3182 + }, + { + "epoch": 0.4049103167535937, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.000951766967773, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.848006010055542, + "num_tokens": 121480781.0, + "step": 3183 + }, + { + "epoch": 0.4050375270321842, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.288471221923828, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8604280948638916, + "num_tokens": 121517614.0, + "step": 3184 + }, + { + "epoch": 0.40516473731077474, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.953039169311523, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8495088219642639, + "num_tokens": 121552225.0, + "step": 3185 + }, + { + "epoch": 0.4052919475893652, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10944938659668, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8478598594665527, + "num_tokens": 121587177.0, + "step": 3186 + }, + { + "epoch": 0.40541915786795574, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.90508460998535, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8352748155593872, + "num_tokens": 121619547.0, + "step": 3187 + }, + { + "epoch": 0.40554636814654627, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.114866256713867, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.860719621181488, + "num_tokens": 121657396.0, + "step": 3188 + }, + { + "epoch": 0.40567357842513674, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.949378967285156, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8365530967712402, + "num_tokens": 121691867.0, + "step": 3189 + }, + { + "epoch": 0.40580078870372727, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.89893913269043, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.847034752368927, + "num_tokens": 121723546.0, + "step": 3190 + }, + { + "epoch": 0.4059279989823178, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.045564651489258, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8531339764595032, + "num_tokens": 121764589.0, + "step": 3191 + }, + { + "epoch": 0.40605520926090827, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.968618392944336, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8423416614532471, + "num_tokens": 121803281.0, + "step": 3192 + }, + { + "epoch": 0.4061824195394988, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.075563430786133, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8398194313049316, + "num_tokens": 121841710.0, + "step": 3193 + }, + { + "epoch": 0.4063096298180893, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.02248764038086, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8511158227920532, + "num_tokens": 121881979.0, + "step": 3194 + }, + { + "epoch": 0.4064368400966798, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.989200592041016, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8572234511375427, + "num_tokens": 121921599.0, + "step": 3195 + }, + { + "epoch": 0.4065640503752703, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.83183479309082, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.862468421459198, + "num_tokens": 121960545.0, + "step": 3196 + }, + { + "epoch": 0.40669126065386085, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.033100128173828, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8344549536705017, + "num_tokens": 121999259.0, + "step": 3197 + }, + { + "epoch": 0.4068184709324513, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.90694808959961, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8662394285202026, + "num_tokens": 122039066.0, + "step": 3198 + }, + { + "epoch": 0.40694568121104185, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.98294448852539, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8609553575515747, + "num_tokens": 122085035.0, + "step": 3199 + }, + { + "epoch": 0.4070728914896324, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.90581512451172, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8675284385681152, + "num_tokens": 122128007.0, + "step": 3200 + }, + { + "epoch": 0.40720010176822286, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.037822723388672, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8512372970581055, + "num_tokens": 122169623.0, + "step": 3201 + }, + { + "epoch": 0.4073273120468134, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.015987396240234, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8479684591293335, + "num_tokens": 122201755.0, + "step": 3202 + }, + { + "epoch": 0.4074545223254039, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.976749420166016, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8629686236381531, + "num_tokens": 122233680.0, + "step": 3203 + }, + { + "epoch": 0.4075817326039944, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.973461151123047, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.850186824798584, + "num_tokens": 122265107.0, + "step": 3204 + }, + { + "epoch": 0.4077089428825849, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.958938598632812, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8576246500015259, + "num_tokens": 122302750.0, + "step": 3205 + }, + { + "epoch": 0.40783615316117544, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.90764045715332, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8441903591156006, + "num_tokens": 122346820.0, + "step": 3206 + }, + { + "epoch": 0.4079633634397659, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.948352813720703, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8584017753601074, + "num_tokens": 122383254.0, + "step": 3207 + }, + { + "epoch": 0.40809057371835644, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.133792877197266, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8650214672088623, + "num_tokens": 122416693.0, + "step": 3208 + }, + { + "epoch": 0.40821778399694697, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.91726303100586, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8580728769302368, + "num_tokens": 122452403.0, + "step": 3209 + }, + { + "epoch": 0.40834499427553744, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.94760513305664, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8424946665763855, + "num_tokens": 122489685.0, + "step": 3210 + }, + { + "epoch": 0.40847220455412797, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.95839500427246, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8357725143432617, + "num_tokens": 122529079.0, + "step": 3211 + }, + { + "epoch": 0.4085994148327185, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.101348876953125, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8472129702568054, + "num_tokens": 122569933.0, + "step": 3212 + }, + { + "epoch": 0.40872662511130897, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.787405014038086, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8341218829154968, + "num_tokens": 122607884.0, + "step": 3213 + }, + { + "epoch": 0.4088538353898995, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.976778030395508, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8542283773422241, + "num_tokens": 122651968.0, + "step": 3214 + }, + { + "epoch": 0.40898104566849003, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.99636459350586, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8574957847595215, + "num_tokens": 122688680.0, + "step": 3215 + }, + { + "epoch": 0.4091082559470805, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.772615432739258, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8587095737457275, + "num_tokens": 122728271.0, + "step": 3216 + }, + { + "epoch": 0.40923546622567103, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.991382598876953, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8519333600997925, + "num_tokens": 122759691.0, + "step": 3217 + }, + { + "epoch": 0.40936267650426156, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.892677307128906, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8522436618804932, + "num_tokens": 122800185.0, + "step": 3218 + }, + { + "epoch": 0.40948988678285203, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.90690040588379, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8628873825073242, + "num_tokens": 122839548.0, + "step": 3219 + }, + { + "epoch": 0.40961709706144256, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.900920867919922, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8392230272293091, + "num_tokens": 122879621.0, + "step": 3220 + }, + { + "epoch": 0.4097443073400331, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.1260929107666, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8478330373764038, + "num_tokens": 122921495.0, + "step": 3221 + }, + { + "epoch": 0.40987151761862356, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.83504867553711, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8594092130661011, + "num_tokens": 122957197.0, + "step": 3222 + }, + { + "epoch": 0.4099987278972141, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.02568244934082, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8341716527938843, + "num_tokens": 122999999.0, + "step": 3223 + }, + { + "epoch": 0.4101259381758046, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.921138763427734, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8579716086387634, + "num_tokens": 123040402.0, + "step": 3224 + }, + { + "epoch": 0.4102531484543951, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.242401123046875, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8575589656829834, + "num_tokens": 123080086.0, + "step": 3225 + }, + { + "epoch": 0.4103803587329856, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.950531005859375, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8555963039398193, + "num_tokens": 123120058.0, + "step": 3226 + }, + { + "epoch": 0.41050756901157615, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.08629035949707, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8543455600738525, + "num_tokens": 123162003.0, + "step": 3227 + }, + { + "epoch": 0.4106347792901666, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.96755599975586, + "learning_rate": 1e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8205543756484985, + "num_tokens": 123202249.0, + "step": 3228 + }, + { + "epoch": 0.41076198956875715, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.004764556884766, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8479874134063721, + "num_tokens": 123237401.0, + "step": 3229 + }, + { + "epoch": 0.4108891998473477, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.890371322631836, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8692575693130493, + "num_tokens": 123269364.0, + "step": 3230 + }, + { + "epoch": 0.4110164101259382, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.056379318237305, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.855246901512146, + "num_tokens": 123307743.0, + "step": 3231 + }, + { + "epoch": 0.4111436204045287, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.118244171142578, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8576375246047974, + "num_tokens": 123340102.0, + "step": 3232 + }, + { + "epoch": 0.4112708306831192, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.961429595947266, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8483095765113831, + "num_tokens": 123374956.0, + "step": 3233 + }, + { + "epoch": 0.41139804096170973, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.927621841430664, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8630694150924683, + "num_tokens": 123418070.0, + "step": 3234 + }, + { + "epoch": 0.4115252512403002, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.05178451538086, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8514891862869263, + "num_tokens": 123453183.0, + "step": 3235 + }, + { + "epoch": 0.41165246151889073, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.119247436523438, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8538938164710999, + "num_tokens": 123491526.0, + "step": 3236 + }, + { + "epoch": 0.41177967179748126, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.986587524414062, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8545738458633423, + "num_tokens": 123529743.0, + "step": 3237 + }, + { + "epoch": 0.41190688207607173, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.73723602294922, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8560938835144043, + "num_tokens": 123573402.0, + "step": 3238 + }, + { + "epoch": 0.41203409235466226, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.022287368774414, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8501496911048889, + "num_tokens": 123604167.0, + "step": 3239 + }, + { + "epoch": 0.4121613026332528, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.936870574951172, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8501932621002197, + "num_tokens": 123648104.0, + "step": 3240 + }, + { + "epoch": 0.41228851291184326, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.013267517089844, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.843128502368927, + "num_tokens": 123689678.0, + "step": 3241 + }, + { + "epoch": 0.4124157231904338, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.825450897216797, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8605591654777527, + "num_tokens": 123729257.0, + "step": 3242 + }, + { + "epoch": 0.4125429334690243, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.029775619506836, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8555729389190674, + "num_tokens": 123772251.0, + "step": 3243 + }, + { + "epoch": 0.4126701437476148, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.047876358032227, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8662333488464355, + "num_tokens": 123810345.0, + "step": 3244 + }, + { + "epoch": 0.4127973540262053, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.078893661499023, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8498205542564392, + "num_tokens": 123855276.0, + "step": 3245 + }, + { + "epoch": 0.41292456430479585, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.98599624633789, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.856513261795044, + "num_tokens": 123896613.0, + "step": 3246 + }, + { + "epoch": 0.4130517745833863, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.032405853271484, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8371464014053345, + "num_tokens": 123940713.0, + "step": 3247 + }, + { + "epoch": 0.41317898486197685, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.01430892944336, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8428725004196167, + "num_tokens": 123981485.0, + "step": 3248 + }, + { + "epoch": 0.4133061951405674, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.13290023803711, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8537262678146362, + "num_tokens": 124018159.0, + "step": 3249 + }, + { + "epoch": 0.41343340541915785, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.96648406982422, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8628475666046143, + "num_tokens": 124049911.0, + "step": 3250 + }, + { + "epoch": 0.4135606156977484, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.111888885498047, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.873956024646759, + "num_tokens": 124084260.0, + "step": 3251 + }, + { + "epoch": 0.4136878259763389, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.945741653442383, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8568795919418335, + "num_tokens": 124122741.0, + "step": 3252 + }, + { + "epoch": 0.4138150362549294, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.05553436279297, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8498213291168213, + "num_tokens": 124162793.0, + "step": 3253 + }, + { + "epoch": 0.4139422465335199, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.078899383544922, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8464962244033813, + "num_tokens": 124206889.0, + "step": 3254 + }, + { + "epoch": 0.41406945681211044, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.004108428955078, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8642879128456116, + "num_tokens": 124244551.0, + "step": 3255 + }, + { + "epoch": 0.4141966670907009, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.183610916137695, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8565452098846436, + "num_tokens": 124290485.0, + "step": 3256 + }, + { + "epoch": 0.41432387736929144, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.929523468017578, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8604476451873779, + "num_tokens": 124323763.0, + "step": 3257 + }, + { + "epoch": 0.41445108764788197, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.12302017211914, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8544676303863525, + "num_tokens": 124363318.0, + "step": 3258 + }, + { + "epoch": 0.41457829792647244, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.069232940673828, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8768782615661621, + "num_tokens": 124400845.0, + "step": 3259 + }, + { + "epoch": 0.41470550820506297, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.052410125732422, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8598147034645081, + "num_tokens": 124442546.0, + "step": 3260 + }, + { + "epoch": 0.4148327184836535, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.050678253173828, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8486669063568115, + "num_tokens": 124477737.0, + "step": 3261 + }, + { + "epoch": 0.41495992876224397, + "ewc_loss": 0.020263671875, + "ewc_loss_parallel": 2.0265579223632812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.833650588989258, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.840639054775238, + "num_tokens": 124518081.0, + "step": 3262 + }, + { + "epoch": 0.4150871390408345, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.03215789794922, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8434208631515503, + "num_tokens": 124556361.0, + "step": 3263 + }, + { + "epoch": 0.415214349319425, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.039113998413086, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8523577451705933, + "num_tokens": 124588490.0, + "step": 3264 + }, + { + "epoch": 0.4153415595980155, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.0128231048584, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8516254425048828, + "num_tokens": 124624656.0, + "step": 3265 + }, + { + "epoch": 0.415468769876606, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.098793029785156, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8406128883361816, + "num_tokens": 124660681.0, + "step": 3266 + }, + { + "epoch": 0.41559598015519655, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.016616821289062, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8546644449234009, + "num_tokens": 124703448.0, + "step": 3267 + }, + { + "epoch": 0.415723190433787, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.988454818725586, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8510315418243408, + "num_tokens": 124742078.0, + "step": 3268 + }, + { + "epoch": 0.41585040071237755, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.118818283081055, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8407347202301025, + "num_tokens": 124779894.0, + "step": 3269 + }, + { + "epoch": 0.4159776109909681, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.117151260375977, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8484191298484802, + "num_tokens": 124817918.0, + "step": 3270 + }, + { + "epoch": 0.41610482126955856, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.869129180908203, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8694605827331543, + "num_tokens": 124854839.0, + "step": 3271 + }, + { + "epoch": 0.4162320315481491, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.211238861083984, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8370946645736694, + "num_tokens": 124886779.0, + "step": 3272 + }, + { + "epoch": 0.4163592418267396, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.029224395751953, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8625894784927368, + "num_tokens": 124927919.0, + "step": 3273 + }, + { + "epoch": 0.4164864521053301, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.108488082885742, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8499208688735962, + "num_tokens": 124966475.0, + "step": 3274 + }, + { + "epoch": 0.4166136623839206, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.104522705078125, + "learning_rate": 1e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8286092281341553, + "num_tokens": 125004110.0, + "step": 3275 + }, + { + "epoch": 0.41674087266251114, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.907981872558594, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8674495816230774, + "num_tokens": 125043164.0, + "step": 3276 + }, + { + "epoch": 0.4168680829411016, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.18872833251953, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8568358421325684, + "num_tokens": 125084255.0, + "step": 3277 + }, + { + "epoch": 0.41699529321969214, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.177207946777344, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8670706152915955, + "num_tokens": 125124971.0, + "step": 3278 + }, + { + "epoch": 0.41712250349828267, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.219411849975586, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8497745990753174, + "num_tokens": 125161470.0, + "step": 3279 + }, + { + "epoch": 0.4172497137768732, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.062360763549805, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8375158309936523, + "num_tokens": 125199117.0, + "step": 3280 + }, + { + "epoch": 0.41737692405546367, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.175365447998047, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8345849514007568, + "num_tokens": 125238136.0, + "step": 3281 + }, + { + "epoch": 0.4175041343340542, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.121686935424805, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8510106205940247, + "num_tokens": 125281041.0, + "step": 3282 + }, + { + "epoch": 0.41763134461264473, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.24966049194336, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8556687831878662, + "num_tokens": 125314739.0, + "step": 3283 + }, + { + "epoch": 0.4177585548912352, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.01372528076172, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8459945917129517, + "num_tokens": 125353129.0, + "step": 3284 + }, + { + "epoch": 0.41788576516982573, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.150901794433594, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8522830009460449, + "num_tokens": 125394092.0, + "step": 3285 + }, + { + "epoch": 0.41801297544841626, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.99551010131836, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8466647863388062, + "num_tokens": 125438262.0, + "step": 3286 + }, + { + "epoch": 0.41814018572700673, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.102245330810547, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8506066799163818, + "num_tokens": 125477842.0, + "step": 3287 + }, + { + "epoch": 0.41826739600559726, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.058168411254883, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8647381067276001, + "num_tokens": 125517236.0, + "step": 3288 + }, + { + "epoch": 0.4183946062841878, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10769271850586, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8463486433029175, + "num_tokens": 125561598.0, + "step": 3289 + }, + { + "epoch": 0.41852181656277826, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.01341438293457, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8513494729995728, + "num_tokens": 125599104.0, + "step": 3290 + }, + { + "epoch": 0.4186490268413688, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.97072410583496, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.85355544090271, + "num_tokens": 125635286.0, + "step": 3291 + }, + { + "epoch": 0.4187762371199593, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.127506256103516, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8577007055282593, + "num_tokens": 125669534.0, + "step": 3292 + }, + { + "epoch": 0.4189034473985498, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10965919494629, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8525919318199158, + "num_tokens": 125707966.0, + "step": 3293 + }, + { + "epoch": 0.4190306576771403, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.00394058227539, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8482518196105957, + "num_tokens": 125741448.0, + "step": 3294 + }, + { + "epoch": 0.41915786795573085, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.913904190063477, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8473062515258789, + "num_tokens": 125776887.0, + "step": 3295 + }, + { + "epoch": 0.4192850782343213, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.04884910583496, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8575374484062195, + "num_tokens": 125807899.0, + "step": 3296 + }, + { + "epoch": 0.41941228851291185, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.99754524230957, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8605513572692871, + "num_tokens": 125847711.0, + "step": 3297 + }, + { + "epoch": 0.4195394987915024, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.080970764160156, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8659851551055908, + "num_tokens": 125883248.0, + "step": 3298 + }, + { + "epoch": 0.41966670907009285, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.210527420043945, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8409878015518188, + "num_tokens": 125925771.0, + "step": 3299 + }, + { + "epoch": 0.4197939193486834, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.990753173828125, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8398036956787109, + "num_tokens": 125968815.0, + "step": 3300 + }, + { + "epoch": 0.4199211296272739, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.136274337768555, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8473938703536987, + "num_tokens": 126007761.0, + "step": 3301 + }, + { + "epoch": 0.4200483399058644, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.88402557373047, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8337968587875366, + "num_tokens": 126047872.0, + "step": 3302 + }, + { + "epoch": 0.4201755501844549, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.063247680664062, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8599427342414856, + "num_tokens": 126081273.0, + "step": 3303 + }, + { + "epoch": 0.42030276046304543, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.058881759643555, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8383073210716248, + "num_tokens": 126121590.0, + "step": 3304 + }, + { + "epoch": 0.4204299707416359, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.070167541503906, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8366276621818542, + "num_tokens": 126168396.0, + "step": 3305 + }, + { + "epoch": 0.42055718102022643, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.04387092590332, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8478947281837463, + "num_tokens": 126205220.0, + "step": 3306 + }, + { + "epoch": 0.42068439129881696, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.224098205566406, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8464062213897705, + "num_tokens": 126239131.0, + "step": 3307 + }, + { + "epoch": 0.42081160157740743, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.212135314941406, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8551090955734253, + "num_tokens": 126271931.0, + "step": 3308 + }, + { + "epoch": 0.42093881185599796, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10676383972168, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8622044920921326, + "num_tokens": 126307730.0, + "step": 3309 + }, + { + "epoch": 0.4210660221345885, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.759431838989258, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8559751510620117, + "num_tokens": 126344528.0, + "step": 3310 + }, + { + "epoch": 0.42119323241317896, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.049028396606445, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.860927939414978, + "num_tokens": 126377260.0, + "step": 3311 + }, + { + "epoch": 0.4213204426917695, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.7790584564209, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.849815309047699, + "num_tokens": 126409723.0, + "step": 3312 + }, + { + "epoch": 0.42144765297036, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.279993057250977, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.856524646282196, + "num_tokens": 126450929.0, + "step": 3313 + }, + { + "epoch": 0.4215748632489505, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10301399230957, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.862291157245636, + "num_tokens": 126486878.0, + "step": 3314 + }, + { + "epoch": 0.421702073527541, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.343055725097656, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8405923843383789, + "num_tokens": 126522884.0, + "step": 3315 + }, + { + "epoch": 0.42182928380613155, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.354202270507812, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8556973934173584, + "num_tokens": 126560848.0, + "step": 3316 + }, + { + "epoch": 0.421956494084722, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.164262771606445, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8544950485229492, + "num_tokens": 126592947.0, + "step": 3317 + }, + { + "epoch": 0.42208370436331255, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.20237159729004, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8635201454162598, + "num_tokens": 126630631.0, + "step": 3318 + }, + { + "epoch": 0.4222109146419031, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.453933715820312, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8479973077774048, + "num_tokens": 126673395.0, + "step": 3319 + }, + { + "epoch": 0.42233812492049355, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4212646484375, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8695659041404724, + "num_tokens": 126711504.0, + "step": 3320 + }, + { + "epoch": 0.4224653351990841, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.273263931274414, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8663533926010132, + "num_tokens": 126753217.0, + "step": 3321 + }, + { + "epoch": 0.4225925454776746, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.07959747314453, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8588172793388367, + "num_tokens": 126792982.0, + "step": 3322 + }, + { + "epoch": 0.4227197557562651, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.456926345825195, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8466918468475342, + "num_tokens": 126823771.0, + "step": 3323 + }, + { + "epoch": 0.4228469660348556, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10175132751465, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8570452928543091, + "num_tokens": 126867137.0, + "step": 3324 + }, + { + "epoch": 0.42297417631344614, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.12928581237793, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8589895367622375, + "num_tokens": 126902680.0, + "step": 3325 + }, + { + "epoch": 0.4231013865920366, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.283233642578125, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8653741478919983, + "num_tokens": 126941037.0, + "step": 3326 + }, + { + "epoch": 0.42322859687062714, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.16008949279785, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8575412631034851, + "num_tokens": 126980009.0, + "step": 3327 + }, + { + "epoch": 0.42335580714921767, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.34071159362793, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8654166460037231, + "num_tokens": 127020022.0, + "step": 3328 + }, + { + "epoch": 0.42348301742780814, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10127067565918, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8342211842536926, + "num_tokens": 127060500.0, + "step": 3329 + }, + { + "epoch": 0.42361022770639867, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.226003646850586, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8506847620010376, + "num_tokens": 127095886.0, + "step": 3330 + }, + { + "epoch": 0.4237374379849892, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.049697875976562, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8446990847587585, + "num_tokens": 127134334.0, + "step": 3331 + }, + { + "epoch": 0.4238646482635797, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.9560489654541, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8460093140602112, + "num_tokens": 127174288.0, + "step": 3332 + }, + { + "epoch": 0.4239918585421702, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.29588508605957, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8514394760131836, + "num_tokens": 127210311.0, + "step": 3333 + }, + { + "epoch": 0.4241190688207607, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.186250686645508, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.874204158782959, + "num_tokens": 127252813.0, + "step": 3334 + }, + { + "epoch": 0.42424627909935125, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.07225799560547, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8405681252479553, + "num_tokens": 127285847.0, + "step": 3335 + }, + { + "epoch": 0.4243734893779417, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.178987503051758, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8680267333984375, + "num_tokens": 127325360.0, + "step": 3336 + }, + { + "epoch": 0.42450069965653225, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.141332626342773, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8579647541046143, + "num_tokens": 127366446.0, + "step": 3337 + }, + { + "epoch": 0.4246279099351228, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.129344940185547, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8627926111221313, + "num_tokens": 127402608.0, + "step": 3338 + }, + { + "epoch": 0.42475512021371326, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.977632522583008, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8596965074539185, + "num_tokens": 127445355.0, + "step": 3339 + }, + { + "epoch": 0.4248823304923038, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.16676902770996, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8493648767471313, + "num_tokens": 127480523.0, + "step": 3340 + }, + { + "epoch": 0.4250095407708943, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.34137725830078, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8736501336097717, + "num_tokens": 127520787.0, + "step": 3341 + }, + { + "epoch": 0.4251367510494848, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.195947647094727, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8493093252182007, + "num_tokens": 127565576.0, + "step": 3342 + }, + { + "epoch": 0.4252639613280753, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.12504005432129, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8645914793014526, + "num_tokens": 127599144.0, + "step": 3343 + }, + { + "epoch": 0.42539117160666584, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.196624755859375, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8729357719421387, + "num_tokens": 127637361.0, + "step": 3344 + }, + { + "epoch": 0.4255183818852563, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.164203643798828, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8441197872161865, + "num_tokens": 127675139.0, + "step": 3345 + }, + { + "epoch": 0.42564559216384684, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.156524658203125, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8599129915237427, + "num_tokens": 127718822.0, + "step": 3346 + }, + { + "epoch": 0.42577280244243737, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.23979377746582, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8586745262145996, + "num_tokens": 127763332.0, + "step": 3347 + }, + { + "epoch": 0.42590001272102784, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.0310001373291, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8658313155174255, + "num_tokens": 127800346.0, + "step": 3348 + }, + { + "epoch": 0.42602722299961837, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.340087890625, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8520267009735107, + "num_tokens": 127845142.0, + "step": 3349 + }, + { + "epoch": 0.4261544332782089, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.16312599182129, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8370669484138489, + "num_tokens": 127881131.0, + "step": 3350 + }, + { + "epoch": 0.4262816435567994, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.199228286743164, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.868370771408081, + "num_tokens": 127917941.0, + "step": 3351 + }, + { + "epoch": 0.4264088538353899, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.070932388305664, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8405298590660095, + "num_tokens": 127963345.0, + "step": 3352 + }, + { + "epoch": 0.42653606411398043, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.192577362060547, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8626528978347778, + "num_tokens": 128004839.0, + "step": 3353 + }, + { + "epoch": 0.4266632743925709, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.3940372467041, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8471245765686035, + "num_tokens": 128042709.0, + "step": 3354 + }, + { + "epoch": 0.42679048467116143, + "ewc_loss": 0.0205078125, + "ewc_loss_parallel": 2.0503997802734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.02763557434082, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8392505049705505, + "num_tokens": 128085069.0, + "step": 3355 + }, + { + "epoch": 0.42691769494975196, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.35002326965332, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8479847311973572, + "num_tokens": 128124300.0, + "step": 3356 + }, + { + "epoch": 0.42704490522834243, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.14045524597168, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8520179390907288, + "num_tokens": 128164410.0, + "step": 3357 + }, + { + "epoch": 0.42717211550693296, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.331493377685547, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8645056486129761, + "num_tokens": 128206524.0, + "step": 3358 + }, + { + "epoch": 0.4272993257855235, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.01883888244629, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8450374603271484, + "num_tokens": 128246387.0, + "step": 3359 + }, + { + "epoch": 0.42742653606411396, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.14883041381836, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8643785715103149, + "num_tokens": 128287794.0, + "step": 3360 + }, + { + "epoch": 0.4275537463427045, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.258317947387695, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8510730266571045, + "num_tokens": 128328467.0, + "step": 3361 + }, + { + "epoch": 0.427680956621295, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.030193328857422, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.863048791885376, + "num_tokens": 128364319.0, + "step": 3362 + }, + { + "epoch": 0.4278081668998855, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.048582077026367, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8497036099433899, + "num_tokens": 128403892.0, + "step": 3363 + }, + { + "epoch": 0.427935377178476, + "ewc_loss": 0.0206298828125, + "ewc_loss_parallel": 2.0623207092285156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.052793502807617, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8347564935684204, + "num_tokens": 128446760.0, + "step": 3364 + }, + { + "epoch": 0.42806258745706655, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.257972717285156, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8295390605926514, + "num_tokens": 128485498.0, + "step": 3365 + }, + { + "epoch": 0.428189797735657, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.0483341217041, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.842964768409729, + "num_tokens": 128527871.0, + "step": 3366 + }, + { + "epoch": 0.42831700801424755, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.26728057861328, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8405395746231079, + "num_tokens": 128567792.0, + "step": 3367 + }, + { + "epoch": 0.4284442182928381, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.023656845092773, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8446325659751892, + "num_tokens": 128607217.0, + "step": 3368 + }, + { + "epoch": 0.42857142857142855, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.079858779907227, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.859257698059082, + "num_tokens": 128642787.0, + "step": 3369 + }, + { + "epoch": 0.4286986388500191, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.379119873046875, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.864669144153595, + "num_tokens": 128680462.0, + "step": 3370 + }, + { + "epoch": 0.4288258491286096, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.129358291625977, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.849889874458313, + "num_tokens": 128715101.0, + "step": 3371 + }, + { + "epoch": 0.4289530594072001, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.126020431518555, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.846211314201355, + "num_tokens": 128749601.0, + "step": 3372 + }, + { + "epoch": 0.4290802696857906, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.112987518310547, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8488643169403076, + "num_tokens": 128788744.0, + "step": 3373 + }, + { + "epoch": 0.42920747996438113, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.260915756225586, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8494242429733276, + "num_tokens": 128828102.0, + "step": 3374 + }, + { + "epoch": 0.4293346902429716, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.965917587280273, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8478413820266724, + "num_tokens": 128863610.0, + "step": 3375 + }, + { + "epoch": 0.42946190052156213, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.405475616455078, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8668436408042908, + "num_tokens": 128898836.0, + "step": 3376 + }, + { + "epoch": 0.42958911080015266, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.90455436706543, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8549646735191345, + "num_tokens": 128934821.0, + "step": 3377 + }, + { + "epoch": 0.42971632107874314, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.275455474853516, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8523209095001221, + "num_tokens": 128966914.0, + "step": 3378 + }, + { + "epoch": 0.42984353135733366, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.16595458984375, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8596420288085938, + "num_tokens": 129008305.0, + "step": 3379 + }, + { + "epoch": 0.4299707416359242, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.639755249023438, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8737285733222961, + "num_tokens": 129043331.0, + "step": 3380 + }, + { + "epoch": 0.4300979519145147, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10216522216797, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8595361709594727, + "num_tokens": 129083513.0, + "step": 3381 + }, + { + "epoch": 0.4302251621931052, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.032800674438477, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8532098531723022, + "num_tokens": 129113463.0, + "step": 3382 + }, + { + "epoch": 0.4303523724716957, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.446704864501953, + "learning_rate": 1e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8297714591026306, + "num_tokens": 129153080.0, + "step": 3383 + }, + { + "epoch": 0.43047958275028625, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.0367488861084, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.850473165512085, + "num_tokens": 129193338.0, + "step": 3384 + }, + { + "epoch": 0.4306067930288767, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.318449020385742, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.843546986579895, + "num_tokens": 129234816.0, + "step": 3385 + }, + { + "epoch": 0.43073400330746725, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.163700103759766, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8557937145233154, + "num_tokens": 129275228.0, + "step": 3386 + }, + { + "epoch": 0.4308612135860578, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.175270080566406, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8363710045814514, + "num_tokens": 129309747.0, + "step": 3387 + }, + { + "epoch": 0.43098842386464825, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.0910701751709, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8563349843025208, + "num_tokens": 129342580.0, + "step": 3388 + }, + { + "epoch": 0.4311156341432388, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.209211349487305, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8476558923721313, + "num_tokens": 129376599.0, + "step": 3389 + }, + { + "epoch": 0.4312428444218293, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.273731231689453, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8523075580596924, + "num_tokens": 129414783.0, + "step": 3390 + }, + { + "epoch": 0.4313700547004198, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.119768142700195, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8530462384223938, + "num_tokens": 129450287.0, + "step": 3391 + }, + { + "epoch": 0.4314972649790103, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.51119041442871, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8361254334449768, + "num_tokens": 129486733.0, + "step": 3392 + }, + { + "epoch": 0.43162447525760084, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.14021110534668, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8534076809883118, + "num_tokens": 129524392.0, + "step": 3393 + }, + { + "epoch": 0.4317516855361913, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.24781608581543, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8396713733673096, + "num_tokens": 129558452.0, + "step": 3394 + }, + { + "epoch": 0.43187889581478184, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.923686981201172, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.852053165435791, + "num_tokens": 129597076.0, + "step": 3395 + }, + { + "epoch": 0.43200610609337237, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.328697204589844, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8466807007789612, + "num_tokens": 129635861.0, + "step": 3396 + }, + { + "epoch": 0.43213331637196284, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.98943328857422, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8488836288452148, + "num_tokens": 129673760.0, + "step": 3397 + }, + { + "epoch": 0.43226052665055337, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.144596099853516, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8544696569442749, + "num_tokens": 129710730.0, + "step": 3398 + }, + { + "epoch": 0.4323877369291439, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.196958541870117, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8500650525093079, + "num_tokens": 129748724.0, + "step": 3399 + }, + { + "epoch": 0.43251494720773437, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.10131072998047, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8661415576934814, + "num_tokens": 129787478.0, + "step": 3400 + }, + { + "epoch": 0.4326421574863249, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.299949645996094, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8633389472961426, + "num_tokens": 129829852.0, + "step": 3401 + }, + { + "epoch": 0.4327693677649154, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 18.933746337890625, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8526378273963928, + "num_tokens": 129869279.0, + "step": 3402 + }, + { + "epoch": 0.4328965780435059, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.24237632751465, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8479692935943604, + "num_tokens": 129903347.0, + "step": 3403 + }, + { + "epoch": 0.4330237883220964, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.114953994750977, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8635812997817993, + "num_tokens": 129940028.0, + "step": 3404 + }, + { + "epoch": 0.43315099860068695, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.16347885131836, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8626923561096191, + "num_tokens": 129976438.0, + "step": 3405 + }, + { + "epoch": 0.4332782088792774, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.16916847229004, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.858698844909668, + "num_tokens": 130015698.0, + "step": 3406 + }, + { + "epoch": 0.43340541915786795, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.146764755249023, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8679035902023315, + "num_tokens": 130056000.0, + "step": 3407 + }, + { + "epoch": 0.4335326294364585, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.22751808166504, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8506220579147339, + "num_tokens": 130096990.0, + "step": 3408 + }, + { + "epoch": 0.43365983971504896, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.28940200805664, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8628950119018555, + "num_tokens": 130128556.0, + "step": 3409 + }, + { + "epoch": 0.4337870499936395, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.07018280029297, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8597372770309448, + "num_tokens": 130170607.0, + "step": 3410 + }, + { + "epoch": 0.43391426027223, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.275054931640625, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.849321722984314, + "num_tokens": 130212511.0, + "step": 3411 + }, + { + "epoch": 0.4340414705508205, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.163639068603516, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8459479808807373, + "num_tokens": 130249246.0, + "step": 3412 + }, + { + "epoch": 0.434168680829411, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.017772674560547, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8734594583511353, + "num_tokens": 130291250.0, + "step": 3413 + }, + { + "epoch": 0.43429589110800154, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.34589195251465, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8326644897460938, + "num_tokens": 130327094.0, + "step": 3414 + }, + { + "epoch": 0.434423101386592, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.12893295288086, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8457733392715454, + "num_tokens": 130366482.0, + "step": 3415 + }, + { + "epoch": 0.43455031166518254, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.211421966552734, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8534858822822571, + "num_tokens": 130407416.0, + "step": 3416 + }, + { + "epoch": 0.43467752194377307, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.254398345947266, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8601611852645874, + "num_tokens": 130447125.0, + "step": 3417 + }, + { + "epoch": 0.43480473222236354, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.199748992919922, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8595186471939087, + "num_tokens": 130480955.0, + "step": 3418 + }, + { + "epoch": 0.43493194250095407, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.173437118530273, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.866126298904419, + "num_tokens": 130511771.0, + "step": 3419 + }, + { + "epoch": 0.4350591527795446, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.255298614501953, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8524007201194763, + "num_tokens": 130550080.0, + "step": 3420 + }, + { + "epoch": 0.4351863630581351, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.40496063232422, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8422756791114807, + "num_tokens": 130588109.0, + "step": 3421 + }, + { + "epoch": 0.4353135733367256, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.155485153198242, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8398908972740173, + "num_tokens": 130626832.0, + "step": 3422 + }, + { + "epoch": 0.43544078361531613, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.358333587646484, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8583531975746155, + "num_tokens": 130662202.0, + "step": 3423 + }, + { + "epoch": 0.4355679938939066, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.250953674316406, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8485938310623169, + "num_tokens": 130700747.0, + "step": 3424 + }, + { + "epoch": 0.43569520417249713, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.1014404296875, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8508687019348145, + "num_tokens": 130742849.0, + "step": 3425 + }, + { + "epoch": 0.43582241445108766, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.438663482666016, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8675767779350281, + "num_tokens": 130780156.0, + "step": 3426 + }, + { + "epoch": 0.43594962472967813, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.28070640563965, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8531899452209473, + "num_tokens": 130825824.0, + "step": 3427 + }, + { + "epoch": 0.43607683500826866, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.241085052490234, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8621355295181274, + "num_tokens": 130863603.0, + "step": 3428 + }, + { + "epoch": 0.4362040452868592, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.22274398803711, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8560614585876465, + "num_tokens": 130906689.0, + "step": 3429 + }, + { + "epoch": 0.4363312555654497, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.217470169067383, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8503914475440979, + "num_tokens": 130947785.0, + "step": 3430 + }, + { + "epoch": 0.4364584658440402, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.192306518554688, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8644739389419556, + "num_tokens": 130985489.0, + "step": 3431 + }, + { + "epoch": 0.4365856761226307, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43600845336914, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.862595796585083, + "num_tokens": 131025274.0, + "step": 3432 + }, + { + "epoch": 0.43671288640122125, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.161901473999023, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8531787395477295, + "num_tokens": 131060132.0, + "step": 3433 + }, + { + "epoch": 0.4368400966798117, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.23583221435547, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8435676693916321, + "num_tokens": 131105109.0, + "step": 3434 + }, + { + "epoch": 0.43696730695840225, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.02167510986328, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8691322803497314, + "num_tokens": 131142094.0, + "step": 3435 + }, + { + "epoch": 0.4370945172369928, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.374135971069336, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8530235290527344, + "num_tokens": 131174648.0, + "step": 3436 + }, + { + "epoch": 0.43722172751558325, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.074045181274414, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8699606657028198, + "num_tokens": 131213447.0, + "step": 3437 + }, + { + "epoch": 0.4373489377941738, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.137798309326172, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8499715924263, + "num_tokens": 131249622.0, + "step": 3438 + }, + { + "epoch": 0.4374761480727643, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.305593490600586, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8704286813735962, + "num_tokens": 131285883.0, + "step": 3439 + }, + { + "epoch": 0.4376033583513548, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.204490661621094, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8575696349143982, + "num_tokens": 131322774.0, + "step": 3440 + }, + { + "epoch": 0.4377305686299453, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.290603637695312, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8659993410110474, + "num_tokens": 131364513.0, + "step": 3441 + }, + { + "epoch": 0.43785777890853583, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.257478713989258, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8450242280960083, + "num_tokens": 131402299.0, + "step": 3442 + }, + { + "epoch": 0.4379849891871263, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.12484359741211, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8620069026947021, + "num_tokens": 131447830.0, + "step": 3443 + }, + { + "epoch": 0.43811219946571683, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.481306076049805, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8478686213493347, + "num_tokens": 131481058.0, + "step": 3444 + }, + { + "epoch": 0.43823940974430736, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.093881607055664, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8509507775306702, + "num_tokens": 131518376.0, + "step": 3445 + }, + { + "epoch": 0.43836662002289783, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.300439834594727, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8395799398422241, + "num_tokens": 131553702.0, + "step": 3446 + }, + { + "epoch": 0.43849383030148836, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.045692443847656, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.862529456615448, + "num_tokens": 131587026.0, + "step": 3447 + }, + { + "epoch": 0.4386210405800789, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.441429138183594, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8604416251182556, + "num_tokens": 131621145.0, + "step": 3448 + }, + { + "epoch": 0.43874825085866936, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.308361053466797, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8608619570732117, + "num_tokens": 131661618.0, + "step": 3449 + }, + { + "epoch": 0.4388754611372599, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.469846725463867, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8482183218002319, + "num_tokens": 131693840.0, + "step": 3450 + }, + { + "epoch": 0.4390026714158504, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.197202682495117, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8569071888923645, + "num_tokens": 131730856.0, + "step": 3451 + }, + { + "epoch": 0.4391298816944409, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.445383071899414, + "learning_rate": 1e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8318485021591187, + "num_tokens": 131769172.0, + "step": 3452 + }, + { + "epoch": 0.4392570919730314, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.307199478149414, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8571006059646606, + "num_tokens": 131810481.0, + "step": 3453 + }, + { + "epoch": 0.43938430225162195, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.32745933532715, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8565903306007385, + "num_tokens": 131850742.0, + "step": 3454 + }, + { + "epoch": 0.4395115125302124, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.369556427001953, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8536633253097534, + "num_tokens": 131886675.0, + "step": 3455 + }, + { + "epoch": 0.43963872280880295, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.253124237060547, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8607257008552551, + "num_tokens": 131926831.0, + "step": 3456 + }, + { + "epoch": 0.4397659330873935, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.196895599365234, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8537263870239258, + "num_tokens": 131962265.0, + "step": 3457 + }, + { + "epoch": 0.43989314336598395, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.26997947692871, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8497191667556763, + "num_tokens": 131999906.0, + "step": 3458 + }, + { + "epoch": 0.4400203536445745, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.284231185913086, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8711690902709961, + "num_tokens": 132031330.0, + "step": 3459 + }, + { + "epoch": 0.440147563923165, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.419347763061523, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8536478877067566, + "num_tokens": 132065049.0, + "step": 3460 + }, + { + "epoch": 0.4402747742017555, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.317340850830078, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8588336110115051, + "num_tokens": 132100993.0, + "step": 3461 + }, + { + "epoch": 0.440401984480346, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.286787033081055, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.85286545753479, + "num_tokens": 132138169.0, + "step": 3462 + }, + { + "epoch": 0.44052919475893654, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.361175537109375, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8618911504745483, + "num_tokens": 132170498.0, + "step": 3463 + }, + { + "epoch": 0.440656405037527, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.21392059326172, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8358038663864136, + "num_tokens": 132213583.0, + "step": 3464 + }, + { + "epoch": 0.44078361531611754, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.307483673095703, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.849973201751709, + "num_tokens": 132255427.0, + "step": 3465 + }, + { + "epoch": 0.44091082559470807, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.405956268310547, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8528790473937988, + "num_tokens": 132295795.0, + "step": 3466 + }, + { + "epoch": 0.44103803587329854, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.26382827758789, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8368513584136963, + "num_tokens": 132336881.0, + "step": 3467 + }, + { + "epoch": 0.44116524615188907, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.222795486450195, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8553940057754517, + "num_tokens": 132375136.0, + "step": 3468 + }, + { + "epoch": 0.4412924564304796, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.378076553344727, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.851669430732727, + "num_tokens": 132412893.0, + "step": 3469 + }, + { + "epoch": 0.44141966670907007, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.765987396240234, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8471613526344299, + "num_tokens": 132444360.0, + "step": 3470 + }, + { + "epoch": 0.4415468769876606, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.368656158447266, + "learning_rate": 1e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8278993368148804, + "num_tokens": 132485022.0, + "step": 3471 + }, + { + "epoch": 0.4416740872662511, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.47935676574707, + "learning_rate": 1e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8400280475616455, + "num_tokens": 132522800.0, + "step": 3472 + }, + { + "epoch": 0.4418012975448416, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.416553497314453, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8563694357872009, + "num_tokens": 132560527.0, + "step": 3473 + }, + { + "epoch": 0.4419285078234321, + "ewc_loss": 0.0203857421875, + "ewc_loss_parallel": 2.0384788513183594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.211742401123047, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8517454266548157, + "num_tokens": 132601075.0, + "step": 3474 + }, + { + "epoch": 0.44205571810202265, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.40793228149414, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8448663949966431, + "num_tokens": 132639851.0, + "step": 3475 + }, + { + "epoch": 0.4421829283806131, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.14278221130371, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8560076951980591, + "num_tokens": 132679849.0, + "step": 3476 + }, + { + "epoch": 0.44231013865920366, + "ewc_loss": 0.020751953125, + "ewc_loss_parallel": 2.0742416381835938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.154682159423828, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8587222099304199, + "num_tokens": 132721020.0, + "step": 3477 + }, + { + "epoch": 0.4424373489377942, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.655820846557617, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8410743474960327, + "num_tokens": 132760933.0, + "step": 3478 + }, + { + "epoch": 0.44256455921638466, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.56142807006836, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8649992346763611, + "num_tokens": 132800627.0, + "step": 3479 + }, + { + "epoch": 0.4426917694949752, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.070974349975586, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8529850840568542, + "num_tokens": 132833232.0, + "step": 3480 + }, + { + "epoch": 0.4428189797735657, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.275299072265625, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8518427610397339, + "num_tokens": 132874670.0, + "step": 3481 + }, + { + "epoch": 0.44294619005215624, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.627758026123047, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8567423820495605, + "num_tokens": 132916187.0, + "step": 3482 + }, + { + "epoch": 0.4430734003307467, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.315263748168945, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8577009439468384, + "num_tokens": 132956948.0, + "step": 3483 + }, + { + "epoch": 0.44320061060933724, + "ewc_loss": 0.0208740234375, + "ewc_loss_parallel": 2.086162567138672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.300968170166016, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8592211604118347, + "num_tokens": 132995213.0, + "step": 3484 + }, + { + "epoch": 0.44332782088792777, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.190969467163086, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8510022163391113, + "num_tokens": 133038850.0, + "step": 3485 + }, + { + "epoch": 0.44345503116651824, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.537721633911133, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8558773994445801, + "num_tokens": 133080766.0, + "step": 3486 + }, + { + "epoch": 0.44358224144510877, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.26201820373535, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8501577377319336, + "num_tokens": 133120845.0, + "step": 3487 + }, + { + "epoch": 0.4437094517236993, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.16571044921875, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8513486385345459, + "num_tokens": 133166011.0, + "step": 3488 + }, + { + "epoch": 0.4438366620022898, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.07524299621582, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.852523148059845, + "num_tokens": 133203507.0, + "step": 3489 + }, + { + "epoch": 0.4439638722808803, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.454757690429688, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8315101265907288, + "num_tokens": 133240554.0, + "step": 3490 + }, + { + "epoch": 0.44409108255947083, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.22406578063965, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8368365168571472, + "num_tokens": 133274987.0, + "step": 3491 + }, + { + "epoch": 0.4442182928380613, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.420602798461914, + "learning_rate": 1e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8264352083206177, + "num_tokens": 133319287.0, + "step": 3492 + }, + { + "epoch": 0.44434550311665183, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.058319091796875, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8588322997093201, + "num_tokens": 133353670.0, + "step": 3493 + }, + { + "epoch": 0.44447271339524236, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.41179656982422, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8557103872299194, + "num_tokens": 133391950.0, + "step": 3494 + }, + { + "epoch": 0.44459992367383283, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43263053894043, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8590511083602905, + "num_tokens": 133424758.0, + "step": 3495 + }, + { + "epoch": 0.44472713395242336, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.423234939575195, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8832871913909912, + "num_tokens": 133459869.0, + "step": 3496 + }, + { + "epoch": 0.4448543442310139, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.206514358520508, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8524729013442993, + "num_tokens": 133494598.0, + "step": 3497 + }, + { + "epoch": 0.44498155450960436, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.291824340820312, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8630322217941284, + "num_tokens": 133536403.0, + "step": 3498 + }, + { + "epoch": 0.4451087647881949, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.40848159790039, + "learning_rate": 1e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8327912092208862, + "num_tokens": 133567319.0, + "step": 3499 + }, + { + "epoch": 0.4452359750667854, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.478548049926758, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8470519781112671, + "num_tokens": 133604829.0, + "step": 3500 + }, + { + "epoch": 0.4453631853453759, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.47806167602539, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8539088368415833, + "num_tokens": 133638373.0, + "step": 3501 + }, + { + "epoch": 0.4454903956239664, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.348894119262695, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8647139668464661, + "num_tokens": 133674110.0, + "step": 3502 + }, + { + "epoch": 0.44561760590255695, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.296539306640625, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8551214933395386, + "num_tokens": 133711005.0, + "step": 3503 + }, + { + "epoch": 0.4457448161811474, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.477439880371094, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8657597899436951, + "num_tokens": 133746258.0, + "step": 3504 + }, + { + "epoch": 0.44587202645973795, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.367233276367188, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.863107442855835, + "num_tokens": 133790695.0, + "step": 3505 + }, + { + "epoch": 0.4459992367383285, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.21111297607422, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8504284620285034, + "num_tokens": 133830181.0, + "step": 3506 + }, + { + "epoch": 0.44612644701691895, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.456979751586914, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8577101230621338, + "num_tokens": 133866849.0, + "step": 3507 + }, + { + "epoch": 0.4462536572955095, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.429977416992188, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.853255033493042, + "num_tokens": 133906812.0, + "step": 3508 + }, + { + "epoch": 0.4463808675741, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.41862678527832, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8469127416610718, + "num_tokens": 133948956.0, + "step": 3509 + }, + { + "epoch": 0.4465080778526905, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.31548309326172, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.836819589138031, + "num_tokens": 133988764.0, + "step": 3510 + }, + { + "epoch": 0.446635288131281, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.383743286132812, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8589229583740234, + "num_tokens": 134028148.0, + "step": 3511 + }, + { + "epoch": 0.44676249840987153, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.297046661376953, + "learning_rate": 1e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8301659822463989, + "num_tokens": 134065909.0, + "step": 3512 + }, + { + "epoch": 0.446889708688462, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.499935150146484, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8600704669952393, + "num_tokens": 134108342.0, + "step": 3513 + }, + { + "epoch": 0.44701691896705253, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.33925437927246, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8526965975761414, + "num_tokens": 134145928.0, + "step": 3514 + }, + { + "epoch": 0.44714412924564306, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.22951889038086, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8504000306129456, + "num_tokens": 134185955.0, + "step": 3515 + }, + { + "epoch": 0.44727133952423354, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.224510192871094, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8604457378387451, + "num_tokens": 134225829.0, + "step": 3516 + }, + { + "epoch": 0.44739854980282406, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.29368019104004, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8440051078796387, + "num_tokens": 134261381.0, + "step": 3517 + }, + { + "epoch": 0.4475257600814146, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.332733154296875, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8613678216934204, + "num_tokens": 134294697.0, + "step": 3518 + }, + { + "epoch": 0.44765297036000506, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.151851654052734, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8746860027313232, + "num_tokens": 134334349.0, + "step": 3519 + }, + { + "epoch": 0.4477801806385956, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.270727157592773, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8427666425704956, + "num_tokens": 134373381.0, + "step": 3520 + }, + { + "epoch": 0.4479073909171861, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.39664649963379, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8598372936248779, + "num_tokens": 134407952.0, + "step": 3521 + }, + { + "epoch": 0.4480346011957766, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.098020553588867, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8440356850624084, + "num_tokens": 134458862.0, + "step": 3522 + }, + { + "epoch": 0.4481618114743671, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.393470764160156, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8514743447303772, + "num_tokens": 134494007.0, + "step": 3523 + }, + { + "epoch": 0.44828902175295765, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.26114273071289, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8394361734390259, + "num_tokens": 134534195.0, + "step": 3524 + }, + { + "epoch": 0.4484162320315481, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.237966537475586, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8617832660675049, + "num_tokens": 134570241.0, + "step": 3525 + }, + { + "epoch": 0.44854344231013865, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.38772964477539, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8519973754882812, + "num_tokens": 134611904.0, + "step": 3526 + }, + { + "epoch": 0.4486706525887292, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.291297912597656, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8445689678192139, + "num_tokens": 134644426.0, + "step": 3527 + }, + { + "epoch": 0.44879786286731965, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.353517532348633, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8627407550811768, + "num_tokens": 134678979.0, + "step": 3528 + }, + { + "epoch": 0.4489250731459102, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.17108726501465, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8564568161964417, + "num_tokens": 134717270.0, + "step": 3529 + }, + { + "epoch": 0.4490522834245007, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.30204963684082, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8546172380447388, + "num_tokens": 134755464.0, + "step": 3530 + }, + { + "epoch": 0.44917949370309124, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.306570053100586, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8463741540908813, + "num_tokens": 134793737.0, + "step": 3531 + }, + { + "epoch": 0.4493067039816817, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.38334846496582, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8482581377029419, + "num_tokens": 134834332.0, + "step": 3532 + }, + { + "epoch": 0.44943391426027224, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.347932815551758, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8638162016868591, + "num_tokens": 134871027.0, + "step": 3533 + }, + { + "epoch": 0.44956112453886277, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.241395950317383, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8536370396614075, + "num_tokens": 134908529.0, + "step": 3534 + }, + { + "epoch": 0.44968833481745324, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.26924705505371, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8634738326072693, + "num_tokens": 134944177.0, + "step": 3535 + }, + { + "epoch": 0.44981554509604377, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.30804443359375, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8533602356910706, + "num_tokens": 134977928.0, + "step": 3536 + }, + { + "epoch": 0.4499427553746343, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.178319931030273, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8544586896896362, + "num_tokens": 135017554.0, + "step": 3537 + }, + { + "epoch": 0.45006996565322477, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.357336044311523, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8600879907608032, + "num_tokens": 135058694.0, + "step": 3538 + }, + { + "epoch": 0.4501971759318153, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.350400924682617, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8558143973350525, + "num_tokens": 135094037.0, + "step": 3539 + }, + { + "epoch": 0.4503243862104058, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.280202865600586, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.847571849822998, + "num_tokens": 135132758.0, + "step": 3540 + }, + { + "epoch": 0.4504515964889963, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.31369400024414, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8671417832374573, + "num_tokens": 135169398.0, + "step": 3541 + }, + { + "epoch": 0.4505788067675868, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.431350708007812, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8397863507270813, + "num_tokens": 135204312.0, + "step": 3542 + }, + { + "epoch": 0.45070601704617735, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.211101531982422, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8511419296264648, + "num_tokens": 135246287.0, + "step": 3543 + }, + { + "epoch": 0.4508332273247678, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.261198043823242, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8569661378860474, + "num_tokens": 135280970.0, + "step": 3544 + }, + { + "epoch": 0.45096043760335836, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.46401023864746, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8758432865142822, + "num_tokens": 135317873.0, + "step": 3545 + }, + { + "epoch": 0.4510876478819489, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.251558303833008, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8616337180137634, + "num_tokens": 135354994.0, + "step": 3546 + }, + { + "epoch": 0.45121485816053936, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.254621505737305, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8490413427352905, + "num_tokens": 135393050.0, + "step": 3547 + }, + { + "epoch": 0.4513420684391299, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.396297454833984, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8585442900657654, + "num_tokens": 135431704.0, + "step": 3548 + }, + { + "epoch": 0.4514692787177204, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.24476432800293, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8318071365356445, + "num_tokens": 135471306.0, + "step": 3549 + }, + { + "epoch": 0.4515964889963109, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.42510414123535, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8416339159011841, + "num_tokens": 135509647.0, + "step": 3550 + }, + { + "epoch": 0.4517236992749014, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.23763084411621, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8613003492355347, + "num_tokens": 135550469.0, + "step": 3551 + }, + { + "epoch": 0.45185090955349194, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.249706268310547, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8539262413978577, + "num_tokens": 135589290.0, + "step": 3552 + }, + { + "epoch": 0.4519781198320824, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.406658172607422, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8485199809074402, + "num_tokens": 135624416.0, + "step": 3553 + }, + { + "epoch": 0.45210533011067294, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.27836036682129, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8581819534301758, + "num_tokens": 135662316.0, + "step": 3554 + }, + { + "epoch": 0.45223254038926347, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.20466423034668, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8557281494140625, + "num_tokens": 135701739.0, + "step": 3555 + }, + { + "epoch": 0.45235975066785394, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.36781120300293, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.851283848285675, + "num_tokens": 135741372.0, + "step": 3556 + }, + { + "epoch": 0.45248696094644447, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.187837600708008, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.876944363117218, + "num_tokens": 135779945.0, + "step": 3557 + }, + { + "epoch": 0.452614171225035, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.350648880004883, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8482857942581177, + "num_tokens": 135813179.0, + "step": 3558 + }, + { + "epoch": 0.4527413815036255, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.24150848388672, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8561391830444336, + "num_tokens": 135855147.0, + "step": 3559 + }, + { + "epoch": 0.452868591782216, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.137300491333008, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8626455664634705, + "num_tokens": 135892990.0, + "step": 3560 + }, + { + "epoch": 0.45299580206080653, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.36650848388672, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8730243444442749, + "num_tokens": 135926911.0, + "step": 3561 + }, + { + "epoch": 0.453123012339397, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.289518356323242, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8625538349151611, + "num_tokens": 135963530.0, + "step": 3562 + }, + { + "epoch": 0.45325022261798753, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.284759521484375, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8448929190635681, + "num_tokens": 136000677.0, + "step": 3563 + }, + { + "epoch": 0.45337743289657806, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.329744338989258, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8443319797515869, + "num_tokens": 136037283.0, + "step": 3564 + }, + { + "epoch": 0.45350464317516853, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.239107131958008, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8360210061073303, + "num_tokens": 136071454.0, + "step": 3565 + }, + { + "epoch": 0.45363185345375906, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.20258331298828, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8437997698783875, + "num_tokens": 136105638.0, + "step": 3566 + }, + { + "epoch": 0.4537590637323496, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.316984176635742, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8572980165481567, + "num_tokens": 136147968.0, + "step": 3567 + }, + { + "epoch": 0.45388627401094006, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.255107879638672, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8552247285842896, + "num_tokens": 136190468.0, + "step": 3568 + }, + { + "epoch": 0.4540134842895306, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.211950302124023, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8588698506355286, + "num_tokens": 136229363.0, + "step": 3569 + }, + { + "epoch": 0.4541406945681211, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.237869262695312, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8478258848190308, + "num_tokens": 136268834.0, + "step": 3570 + }, + { + "epoch": 0.4542679048467116, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.274080276489258, + "learning_rate": 1e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8314401507377625, + "num_tokens": 136306761.0, + "step": 3571 + }, + { + "epoch": 0.4543951151253021, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.244932174682617, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8538793325424194, + "num_tokens": 136347736.0, + "step": 3572 + }, + { + "epoch": 0.45452232540389265, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.275781631469727, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8571583032608032, + "num_tokens": 136381009.0, + "step": 3573 + }, + { + "epoch": 0.4546495356824831, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.301559448242188, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8646295070648193, + "num_tokens": 136418050.0, + "step": 3574 + }, + { + "epoch": 0.45477674596107365, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.30902099609375, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8366639614105225, + "num_tokens": 136459109.0, + "step": 3575 + }, + { + "epoch": 0.4549039562396642, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.273405075073242, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8508632779121399, + "num_tokens": 136497993.0, + "step": 3576 + }, + { + "epoch": 0.45503116651825465, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.325851440429688, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8565082550048828, + "num_tokens": 136534446.0, + "step": 3577 + }, + { + "epoch": 0.4551583767968452, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.281021118164062, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8391017913818359, + "num_tokens": 136571117.0, + "step": 3578 + }, + { + "epoch": 0.4552855870754357, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.597383499145508, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8619105815887451, + "num_tokens": 136601706.0, + "step": 3579 + }, + { + "epoch": 0.4554127973540262, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.26018524169922, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8550652265548706, + "num_tokens": 136639735.0, + "step": 3580 + }, + { + "epoch": 0.4555400076326167, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.938674926757812, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8471689224243164, + "num_tokens": 136675647.0, + "step": 3581 + }, + { + "epoch": 0.45566721791120723, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.204591751098633, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8618072867393494, + "num_tokens": 136713361.0, + "step": 3582 + }, + { + "epoch": 0.45579442818979776, + "ewc_loss": 0.02099609375, + "ewc_loss_parallel": 2.09808349609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.492292404174805, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.853171706199646, + "num_tokens": 136751440.0, + "step": 3583 + }, + { + "epoch": 0.45592163846838824, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.657812118530273, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8502817153930664, + "num_tokens": 136786078.0, + "step": 3584 + }, + { + "epoch": 0.45604884874697876, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.28187370300293, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8620785474777222, + "num_tokens": 136828313.0, + "step": 3585 + }, + { + "epoch": 0.4561760590255693, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.251110076904297, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8524183630943298, + "num_tokens": 136862549.0, + "step": 3586 + }, + { + "epoch": 0.45630326930415976, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4450626373291, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8578722476959229, + "num_tokens": 136906423.0, + "step": 3587 + }, + { + "epoch": 0.4564304795827503, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.32705307006836, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8425702452659607, + "num_tokens": 136941550.0, + "step": 3588 + }, + { + "epoch": 0.4565576898613408, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.116540908813477, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8573303818702698, + "num_tokens": 136980234.0, + "step": 3589 + }, + { + "epoch": 0.4566849001399313, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.46982765197754, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8576276302337646, + "num_tokens": 137023991.0, + "step": 3590 + }, + { + "epoch": 0.4568121104185218, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.332870483398438, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8448878526687622, + "num_tokens": 137063261.0, + "step": 3591 + }, + { + "epoch": 0.45693932069711235, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.400922775268555, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.863530695438385, + "num_tokens": 137103935.0, + "step": 3592 + }, + { + "epoch": 0.4570665309757028, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.52326774597168, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8551466464996338, + "num_tokens": 137140194.0, + "step": 3593 + }, + { + "epoch": 0.45719374125429335, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.30240249633789, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8450857400894165, + "num_tokens": 137179086.0, + "step": 3594 + }, + { + "epoch": 0.4573209515328839, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.72785758972168, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.859311580657959, + "num_tokens": 137216887.0, + "step": 3595 + }, + { + "epoch": 0.45744816181147435, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.401920318603516, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8697199821472168, + "num_tokens": 137255029.0, + "step": 3596 + }, + { + "epoch": 0.4575753720900649, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.575393676757812, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8581016659736633, + "num_tokens": 137291399.0, + "step": 3597 + }, + { + "epoch": 0.4577025823686554, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.349288940429688, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8587359189987183, + "num_tokens": 137326569.0, + "step": 3598 + }, + { + "epoch": 0.4578297926472459, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.39590835571289, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8507978916168213, + "num_tokens": 137370035.0, + "step": 3599 + }, + { + "epoch": 0.4579570029258364, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.539047241210938, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8510886430740356, + "num_tokens": 137407969.0, + "step": 3600 + }, + { + "epoch": 0.45808421320442694, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.267255783081055, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8532052040100098, + "num_tokens": 137447256.0, + "step": 3601 + }, + { + "epoch": 0.4582114234830174, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.57025146484375, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8401634693145752, + "num_tokens": 137479762.0, + "step": 3602 + }, + { + "epoch": 0.45833863376160794, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.351320266723633, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8468054533004761, + "num_tokens": 137521823.0, + "step": 3603 + }, + { + "epoch": 0.45846584404019847, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.312793731689453, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8517576456069946, + "num_tokens": 137558174.0, + "step": 3604 + }, + { + "epoch": 0.45859305431878894, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.3303165435791, + "learning_rate": 1e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8302671909332275, + "num_tokens": 137597265.0, + "step": 3605 + }, + { + "epoch": 0.45872026459737947, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.135908126831055, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8621727228164673, + "num_tokens": 137635954.0, + "step": 3606 + }, + { + "epoch": 0.45884747487597, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.446178436279297, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.856666088104248, + "num_tokens": 137676137.0, + "step": 3607 + }, + { + "epoch": 0.45897468515456047, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.35633659362793, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8571180701255798, + "num_tokens": 137716723.0, + "step": 3608 + }, + { + "epoch": 0.459101895433151, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.50098991394043, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8362222909927368, + "num_tokens": 137752435.0, + "step": 3609 + }, + { + "epoch": 0.4592291057117415, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.356843948364258, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8468695282936096, + "num_tokens": 137786684.0, + "step": 3610 + }, + { + "epoch": 0.459356315990332, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.348684310913086, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.846123993396759, + "num_tokens": 137830668.0, + "step": 3611 + }, + { + "epoch": 0.4594835262689225, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.334373474121094, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8607062101364136, + "num_tokens": 137865262.0, + "step": 3612 + }, + { + "epoch": 0.45961073654751305, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.55002212524414, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8610038757324219, + "num_tokens": 137898867.0, + "step": 3613 + }, + { + "epoch": 0.4597379468261035, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.202619552612305, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8553337454795837, + "num_tokens": 137935108.0, + "step": 3614 + }, + { + "epoch": 0.45986515710469406, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.57640838623047, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8659500479698181, + "num_tokens": 137977714.0, + "step": 3615 + }, + { + "epoch": 0.4599923673832846, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.314109802246094, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8674459457397461, + "num_tokens": 138006345.0, + "step": 3616 + }, + { + "epoch": 0.46011957766187506, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.2894287109375, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8549761772155762, + "num_tokens": 138045106.0, + "step": 3617 + }, + { + "epoch": 0.4602467879404656, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.477609634399414, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.849271833896637, + "num_tokens": 138079993.0, + "step": 3618 + }, + { + "epoch": 0.4603739982190561, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.31618881225586, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8585561513900757, + "num_tokens": 138118555.0, + "step": 3619 + }, + { + "epoch": 0.4605012084976466, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.404094696044922, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8631491661071777, + "num_tokens": 138155096.0, + "step": 3620 + }, + { + "epoch": 0.4606284187762371, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.345346450805664, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8564468622207642, + "num_tokens": 138194529.0, + "step": 3621 + }, + { + "epoch": 0.46075562905482764, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.368677139282227, + "learning_rate": 1e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8295590281486511, + "num_tokens": 138235520.0, + "step": 3622 + }, + { + "epoch": 0.4608828393334181, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.704347610473633, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.864012598991394, + "num_tokens": 138271391.0, + "step": 3623 + }, + { + "epoch": 0.46101004961200864, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.177845001220703, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.849563717842102, + "num_tokens": 138313923.0, + "step": 3624 + }, + { + "epoch": 0.46113725989059917, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.523048400878906, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8469812870025635, + "num_tokens": 138355992.0, + "step": 3625 + }, + { + "epoch": 0.46126447016918964, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.282255172729492, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8740570545196533, + "num_tokens": 138393074.0, + "step": 3626 + }, + { + "epoch": 0.4613916804477802, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.61857795715332, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8616155385971069, + "num_tokens": 138427854.0, + "step": 3627 + }, + { + "epoch": 0.4615188907263707, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.421466827392578, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8642162680625916, + "num_tokens": 138467359.0, + "step": 3628 + }, + { + "epoch": 0.4616461010049612, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.350704193115234, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8433185815811157, + "num_tokens": 138505355.0, + "step": 3629 + }, + { + "epoch": 0.4617733112835517, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.3670654296875, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8594066500663757, + "num_tokens": 138543703.0, + "step": 3630 + }, + { + "epoch": 0.46190052156214223, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.252073287963867, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8475729823112488, + "num_tokens": 138579860.0, + "step": 3631 + }, + { + "epoch": 0.46202773184073276, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.448135375976562, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.863835334777832, + "num_tokens": 138615755.0, + "step": 3632 + }, + { + "epoch": 0.46215494211932323, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.395885467529297, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.849892258644104, + "num_tokens": 138647607.0, + "step": 3633 + }, + { + "epoch": 0.46228215239791376, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.315576553344727, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8604838848114014, + "num_tokens": 138679284.0, + "step": 3634 + }, + { + "epoch": 0.4624093626765043, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.244115829467773, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8646342158317566, + "num_tokens": 138719690.0, + "step": 3635 + }, + { + "epoch": 0.46253657295509476, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.409793853759766, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8428613543510437, + "num_tokens": 138758031.0, + "step": 3636 + }, + { + "epoch": 0.4626637832336853, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.311161041259766, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8546639680862427, + "num_tokens": 138797283.0, + "step": 3637 + }, + { + "epoch": 0.4627909935122758, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.494842529296875, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8504830598831177, + "num_tokens": 138834672.0, + "step": 3638 + }, + { + "epoch": 0.4629182037908663, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.309980392456055, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8654968738555908, + "num_tokens": 138871105.0, + "step": 3639 + }, + { + "epoch": 0.4630454140694568, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.594974517822266, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.855983316898346, + "num_tokens": 138903516.0, + "step": 3640 + }, + { + "epoch": 0.46317262434804735, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.38102149963379, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8527833223342896, + "num_tokens": 138942021.0, + "step": 3641 + }, + { + "epoch": 0.4632998346266378, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.276485443115234, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8636996746063232, + "num_tokens": 138978210.0, + "step": 3642 + }, + { + "epoch": 0.46342704490522835, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.375385284423828, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8579456806182861, + "num_tokens": 139019122.0, + "step": 3643 + }, + { + "epoch": 0.4635542551838189, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4996280670166, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8655484914779663, + "num_tokens": 139062187.0, + "step": 3644 + }, + { + "epoch": 0.46368146546240935, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.538959503173828, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8738807439804077, + "num_tokens": 139102960.0, + "step": 3645 + }, + { + "epoch": 0.4638086757409999, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.305622100830078, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8555057048797607, + "num_tokens": 139137302.0, + "step": 3646 + }, + { + "epoch": 0.4639358860195904, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.27534294128418, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8363305330276489, + "num_tokens": 139177195.0, + "step": 3647 + }, + { + "epoch": 0.4640630962981809, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.255863189697266, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8515622019767761, + "num_tokens": 139214722.0, + "step": 3648 + }, + { + "epoch": 0.4641903065767714, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.48303985595703, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8546721935272217, + "num_tokens": 139248569.0, + "step": 3649 + }, + { + "epoch": 0.46431751685536193, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.238414764404297, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8610859513282776, + "num_tokens": 139289141.0, + "step": 3650 + }, + { + "epoch": 0.4644447271339524, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.355104446411133, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8714627027511597, + "num_tokens": 139330791.0, + "step": 3651 + }, + { + "epoch": 0.46457193741254293, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.39872169494629, + "learning_rate": 1e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8227300047874451, + "num_tokens": 139373009.0, + "step": 3652 + }, + { + "epoch": 0.46469914769113346, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.53412437438965, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.853508472442627, + "num_tokens": 139408894.0, + "step": 3653 + }, + { + "epoch": 0.46482635796972394, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.30264663696289, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8530269861221313, + "num_tokens": 139444040.0, + "step": 3654 + }, + { + "epoch": 0.46495356824831446, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.47785758972168, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8614932298660278, + "num_tokens": 139479675.0, + "step": 3655 + }, + { + "epoch": 0.465080778526905, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.296092987060547, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8626227378845215, + "num_tokens": 139521792.0, + "step": 3656 + }, + { + "epoch": 0.46520798880549546, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.618839263916016, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8622854351997375, + "num_tokens": 139563471.0, + "step": 3657 + }, + { + "epoch": 0.465335199084086, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.471134185791016, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8550595045089722, + "num_tokens": 139604702.0, + "step": 3658 + }, + { + "epoch": 0.4654624093626765, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.1673641204834, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.853718638420105, + "num_tokens": 139644573.0, + "step": 3659 + }, + { + "epoch": 0.465589619641267, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.58167266845703, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8407553434371948, + "num_tokens": 139680924.0, + "step": 3660 + }, + { + "epoch": 0.4657168299198575, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.437007904052734, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8482714891433716, + "num_tokens": 139724369.0, + "step": 3661 + }, + { + "epoch": 0.46584404019844805, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.89122200012207, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.855860710144043, + "num_tokens": 139769412.0, + "step": 3662 + }, + { + "epoch": 0.4659712504770385, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4807071685791, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.831038773059845, + "num_tokens": 139808066.0, + "step": 3663 + }, + { + "epoch": 0.46609846075562905, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.42622947692871, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8599228858947754, + "num_tokens": 139851169.0, + "step": 3664 + }, + { + "epoch": 0.4662256710342196, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.93950080871582, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8654782176017761, + "num_tokens": 139890550.0, + "step": 3665 + }, + { + "epoch": 0.46635288131281005, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.416074752807617, + "learning_rate": 1e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8288339376449585, + "num_tokens": 139931496.0, + "step": 3666 + }, + { + "epoch": 0.4664800915914006, + "ewc_loss": 0.0211181640625, + "ewc_loss_parallel": 2.110004425048828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65167999267578, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.845483660697937, + "num_tokens": 139965872.0, + "step": 3667 + }, + { + "epoch": 0.4666073018699911, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.31951904296875, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8395088911056519, + "num_tokens": 140000394.0, + "step": 3668 + }, + { + "epoch": 0.4667345121485816, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.767440795898438, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8383593559265137, + "num_tokens": 140039967.0, + "step": 3669 + }, + { + "epoch": 0.4668617224271721, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.485828399658203, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8558667302131653, + "num_tokens": 140075739.0, + "step": 3670 + }, + { + "epoch": 0.46698893270576264, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.518762588500977, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.859312891960144, + "num_tokens": 140110016.0, + "step": 3671 + }, + { + "epoch": 0.4671161429843531, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.692794799804688, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8445479273796082, + "num_tokens": 140147905.0, + "step": 3672 + }, + { + "epoch": 0.46724335326294364, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.391624450683594, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8487072587013245, + "num_tokens": 140182284.0, + "step": 3673 + }, + { + "epoch": 0.46737056354153417, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.638683319091797, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8476710915565491, + "num_tokens": 140220669.0, + "step": 3674 + }, + { + "epoch": 0.46749777382012464, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.35551643371582, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8405361771583557, + "num_tokens": 140258399.0, + "step": 3675 + }, + { + "epoch": 0.46762498409871517, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.718175888061523, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8685528635978699, + "num_tokens": 140296015.0, + "step": 3676 + }, + { + "epoch": 0.4677521943773057, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.477352142333984, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8446126580238342, + "num_tokens": 140338618.0, + "step": 3677 + }, + { + "epoch": 0.46787940465589617, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.354772567749023, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8588576316833496, + "num_tokens": 140379151.0, + "step": 3678 + }, + { + "epoch": 0.4680066149344867, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.68291664123535, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8408218622207642, + "num_tokens": 140417740.0, + "step": 3679 + }, + { + "epoch": 0.4681338252130772, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.514171600341797, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8515855073928833, + "num_tokens": 140452454.0, + "step": 3680 + }, + { + "epoch": 0.46826103549166775, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.454631805419922, + "learning_rate": 1e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8310054540634155, + "num_tokens": 140492981.0, + "step": 3681 + }, + { + "epoch": 0.4683882457702582, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.342342376708984, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8439823985099792, + "num_tokens": 140534131.0, + "step": 3682 + }, + { + "epoch": 0.46851545604884876, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.337167739868164, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8469812273979187, + "num_tokens": 140576761.0, + "step": 3683 + }, + { + "epoch": 0.4686426663274393, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.596019744873047, + "learning_rate": 1e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8347063660621643, + "num_tokens": 140609538.0, + "step": 3684 + }, + { + "epoch": 0.46876987660602976, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.334218978881836, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8501676917076111, + "num_tokens": 140641802.0, + "step": 3685 + }, + { + "epoch": 0.4688970868846203, + "ewc_loss": 0.021240234375, + "ewc_loss_parallel": 2.1219253540039062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.48018455505371, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8422446250915527, + "num_tokens": 140685832.0, + "step": 3686 + }, + { + "epoch": 0.4690242971632108, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.32632064819336, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8415567874908447, + "num_tokens": 140725333.0, + "step": 3687 + }, + { + "epoch": 0.4691515074418013, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.29460906982422, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8524258136749268, + "num_tokens": 140760838.0, + "step": 3688 + }, + { + "epoch": 0.4692787177203918, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.514978408813477, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8433806300163269, + "num_tokens": 140797367.0, + "step": 3689 + }, + { + "epoch": 0.46940592799898234, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.384716033935547, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8523361086845398, + "num_tokens": 140839778.0, + "step": 3690 + }, + { + "epoch": 0.4695331382775728, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.30714225769043, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8487851619720459, + "num_tokens": 140880340.0, + "step": 3691 + }, + { + "epoch": 0.46966034855616334, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.504732131958008, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8472926616668701, + "num_tokens": 140915451.0, + "step": 3692 + }, + { + "epoch": 0.46978755883475387, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.413291931152344, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8513643145561218, + "num_tokens": 140956967.0, + "step": 3693 + }, + { + "epoch": 0.46991476911334434, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.469770431518555, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8684829473495483, + "num_tokens": 140996640.0, + "step": 3694 + }, + { + "epoch": 0.47004197939193487, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.39784049987793, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8524599671363831, + "num_tokens": 141036094.0, + "step": 3695 + }, + { + "epoch": 0.4701691896705254, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.465957641601562, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8428257703781128, + "num_tokens": 141069033.0, + "step": 3696 + }, + { + "epoch": 0.4702963999491159, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.502052307128906, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8706808090209961, + "num_tokens": 141111075.0, + "step": 3697 + }, + { + "epoch": 0.4704236102277064, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.309297561645508, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8680109977722168, + "num_tokens": 141145270.0, + "step": 3698 + }, + { + "epoch": 0.47055082050629693, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.41585922241211, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8677413463592529, + "num_tokens": 141182132.0, + "step": 3699 + }, + { + "epoch": 0.4706780307848874, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.46302604675293, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8538509607315063, + "num_tokens": 141217455.0, + "step": 3700 + }, + { + "epoch": 0.47080524106347793, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.487001419067383, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8521084785461426, + "num_tokens": 141249680.0, + "step": 3701 + }, + { + "epoch": 0.47093245134206846, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.46311378479004, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8540545105934143, + "num_tokens": 141292085.0, + "step": 3702 + }, + { + "epoch": 0.47105966162065893, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.22612762451172, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8621177673339844, + "num_tokens": 141331455.0, + "step": 3703 + }, + { + "epoch": 0.47118687189924946, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.385112762451172, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8546526432037354, + "num_tokens": 141365623.0, + "step": 3704 + }, + { + "epoch": 0.47131408217784, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43514633178711, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8439902067184448, + "num_tokens": 141399033.0, + "step": 3705 + }, + { + "epoch": 0.47144129245643046, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.40349578857422, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8600848913192749, + "num_tokens": 141430436.0, + "step": 3706 + }, + { + "epoch": 0.471568502735021, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.573348999023438, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8706650733947754, + "num_tokens": 141463169.0, + "step": 3707 + }, + { + "epoch": 0.4716957130136115, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.292081832885742, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8716801404953003, + "num_tokens": 141495524.0, + "step": 3708 + }, + { + "epoch": 0.471822923292202, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.415019989013672, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8652383089065552, + "num_tokens": 141533099.0, + "step": 3709 + }, + { + "epoch": 0.4719501335707925, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.437158584594727, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8591679334640503, + "num_tokens": 141576389.0, + "step": 3710 + }, + { + "epoch": 0.47207734384938305, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.70675277709961, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8560888767242432, + "num_tokens": 141611870.0, + "step": 3711 + }, + { + "epoch": 0.4722045541279735, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.762916564941406, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8656235933303833, + "num_tokens": 141648539.0, + "step": 3712 + }, + { + "epoch": 0.47233176440656405, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.518722534179688, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8583885431289673, + "num_tokens": 141688549.0, + "step": 3713 + }, + { + "epoch": 0.4724589746851546, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.84644889831543, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8637816905975342, + "num_tokens": 141725294.0, + "step": 3714 + }, + { + "epoch": 0.47258618496374505, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.45863914489746, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8387780785560608, + "num_tokens": 141756761.0, + "step": 3715 + }, + { + "epoch": 0.4727133952423356, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.568500518798828, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8381104469299316, + "num_tokens": 141790417.0, + "step": 3716 + }, + { + "epoch": 0.4728406055209261, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.562198638916016, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8659003376960754, + "num_tokens": 141830749.0, + "step": 3717 + }, + { + "epoch": 0.4729678157995166, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.329954147338867, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8576589822769165, + "num_tokens": 141868433.0, + "step": 3718 + }, + { + "epoch": 0.4730950260781071, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.64348602294922, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8725227117538452, + "num_tokens": 141907140.0, + "step": 3719 + }, + { + "epoch": 0.47322223635669763, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.088809967041016, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8683611750602722, + "num_tokens": 141947436.0, + "step": 3720 + }, + { + "epoch": 0.4733494466352881, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.523883819580078, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8486431837081909, + "num_tokens": 141986876.0, + "step": 3721 + }, + { + "epoch": 0.47347665691387864, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.64885139465332, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8597989082336426, + "num_tokens": 142025806.0, + "step": 3722 + }, + { + "epoch": 0.47360386719246916, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.290802001953125, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8537602424621582, + "num_tokens": 142057932.0, + "step": 3723 + }, + { + "epoch": 0.47373107747105964, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.561687469482422, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8583896160125732, + "num_tokens": 142088880.0, + "step": 3724 + }, + { + "epoch": 0.47385828774965016, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.49445152282715, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8683962821960449, + "num_tokens": 142124959.0, + "step": 3725 + }, + { + "epoch": 0.4739854980282407, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.410329818725586, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8410536050796509, + "num_tokens": 142160667.0, + "step": 3726 + }, + { + "epoch": 0.47411270830683117, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.295759201049805, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8544327020645142, + "num_tokens": 142205017.0, + "step": 3727 + }, + { + "epoch": 0.4742399185854217, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.470809936523438, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8619561791419983, + "num_tokens": 142241479.0, + "step": 3728 + }, + { + "epoch": 0.4743671288640122, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.505107879638672, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8570393323898315, + "num_tokens": 142280881.0, + "step": 3729 + }, + { + "epoch": 0.4744943391426027, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.446088790893555, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8627824783325195, + "num_tokens": 142328248.0, + "step": 3730 + }, + { + "epoch": 0.4746215494211932, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.5501766204834, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8392757177352905, + "num_tokens": 142364872.0, + "step": 3731 + }, + { + "epoch": 0.47474875969978375, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43788719177246, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8376507759094238, + "num_tokens": 142400494.0, + "step": 3732 + }, + { + "epoch": 0.4748759699783743, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.571767807006836, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8514928221702576, + "num_tokens": 142433739.0, + "step": 3733 + }, + { + "epoch": 0.47500318025696475, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.7551212310791, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8639253377914429, + "num_tokens": 142469777.0, + "step": 3734 + }, + { + "epoch": 0.4751303905355553, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.53758430480957, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8458393812179565, + "num_tokens": 142510052.0, + "step": 3735 + }, + { + "epoch": 0.4752576008141458, + "ewc_loss": 0.0213623046875, + "ewc_loss_parallel": 2.1338462829589844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.384754180908203, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8433501124382019, + "num_tokens": 142545689.0, + "step": 3736 + }, + { + "epoch": 0.4753848110927363, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.699094772338867, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8554363250732422, + "num_tokens": 142585916.0, + "step": 3737 + }, + { + "epoch": 0.4755120213713268, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.340185165405273, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8621669411659241, + "num_tokens": 142623019.0, + "step": 3738 + }, + { + "epoch": 0.47563923164991734, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.456735610961914, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8518297672271729, + "num_tokens": 142660986.0, + "step": 3739 + }, + { + "epoch": 0.4757664419285078, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.616268157958984, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8582711219787598, + "num_tokens": 142702876.0, + "step": 3740 + }, + { + "epoch": 0.47589365220709834, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.397192001342773, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.844494640827179, + "num_tokens": 142744082.0, + "step": 3741 + }, + { + "epoch": 0.47602086248568887, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.459314346313477, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8357951641082764, + "num_tokens": 142777511.0, + "step": 3742 + }, + { + "epoch": 0.47614807276427934, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.48257064819336, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8510082960128784, + "num_tokens": 142818882.0, + "step": 3743 + }, + { + "epoch": 0.47627528304286987, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.383848190307617, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8584319353103638, + "num_tokens": 142852771.0, + "step": 3744 + }, + { + "epoch": 0.4764024933214604, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.31858253479004, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8561521768569946, + "num_tokens": 142888398.0, + "step": 3745 + }, + { + "epoch": 0.47652970360005087, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.290586471557617, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8709046840667725, + "num_tokens": 142924789.0, + "step": 3746 + }, + { + "epoch": 0.4766569138786414, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.45391082763672, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8625349998474121, + "num_tokens": 142958843.0, + "step": 3747 + }, + { + "epoch": 0.4767841241572319, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.363143920898438, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8661540746688843, + "num_tokens": 142996408.0, + "step": 3748 + }, + { + "epoch": 0.4769113344358224, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.48565673828125, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.84815514087677, + "num_tokens": 143040031.0, + "step": 3749 + }, + { + "epoch": 0.4770385447144129, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.452863693237305, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8336314558982849, + "num_tokens": 143081648.0, + "step": 3750 + }, + { + "epoch": 0.47716575499300345, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.438825607299805, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8345085978507996, + "num_tokens": 143118913.0, + "step": 3751 + }, + { + "epoch": 0.4772929652715939, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.359275817871094, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8639897704124451, + "num_tokens": 143153858.0, + "step": 3752 + }, + { + "epoch": 0.47742017555018446, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.57571029663086, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8658175468444824, + "num_tokens": 143190977.0, + "step": 3753 + }, + { + "epoch": 0.477547385828775, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.48794174194336, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8474367260932922, + "num_tokens": 143228009.0, + "step": 3754 + }, + { + "epoch": 0.47767459610736546, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.445890426635742, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8443036079406738, + "num_tokens": 143269820.0, + "step": 3755 + }, + { + "epoch": 0.477801806385956, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.354888916015625, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8595455288887024, + "num_tokens": 143303441.0, + "step": 3756 + }, + { + "epoch": 0.4779290166645465, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.189844131469727, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8465537428855896, + "num_tokens": 143338748.0, + "step": 3757 + }, + { + "epoch": 0.478056226943137, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.5264892578125, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8556724786758423, + "num_tokens": 143373900.0, + "step": 3758 + }, + { + "epoch": 0.4781834372217275, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.444730758666992, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8499802350997925, + "num_tokens": 143411190.0, + "step": 3759 + }, + { + "epoch": 0.47831064750031804, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.33742332458496, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.851635754108429, + "num_tokens": 143449470.0, + "step": 3760 + }, + { + "epoch": 0.4784378577789085, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.452478408813477, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8612035512924194, + "num_tokens": 143485868.0, + "step": 3761 + }, + { + "epoch": 0.47856506805749904, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.371950149536133, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8642823100090027, + "num_tokens": 143520334.0, + "step": 3762 + }, + { + "epoch": 0.47869227833608957, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.34642219543457, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8478089570999146, + "num_tokens": 143557752.0, + "step": 3763 + }, + { + "epoch": 0.47881948861468004, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.425317764282227, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8547945022583008, + "num_tokens": 143593385.0, + "step": 3764 + }, + { + "epoch": 0.4789466988932706, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.35683822631836, + "learning_rate": 1e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8346637487411499, + "num_tokens": 143631975.0, + "step": 3765 + }, + { + "epoch": 0.4790739091718611, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.375804901123047, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8376442193984985, + "num_tokens": 143668382.0, + "step": 3766 + }, + { + "epoch": 0.4792011194504516, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.36688804626465, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8402446508407593, + "num_tokens": 143706623.0, + "step": 3767 + }, + { + "epoch": 0.4793283297290421, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.51511573791504, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8575412631034851, + "num_tokens": 143745664.0, + "step": 3768 + }, + { + "epoch": 0.47945554000763263, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.426612854003906, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8457323908805847, + "num_tokens": 143780761.0, + "step": 3769 + }, + { + "epoch": 0.4795827502862231, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.411121368408203, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8592367768287659, + "num_tokens": 143825298.0, + "step": 3770 + }, + { + "epoch": 0.47970996056481363, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.422494888305664, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8743045926094055, + "num_tokens": 143859855.0, + "step": 3771 + }, + { + "epoch": 0.47983717084340416, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4554386138916, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8684006929397583, + "num_tokens": 143898553.0, + "step": 3772 + }, + { + "epoch": 0.47996438112199463, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.41462516784668, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8520252108573914, + "num_tokens": 143935362.0, + "step": 3773 + }, + { + "epoch": 0.48009159140058516, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.71510124206543, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.855720043182373, + "num_tokens": 143978692.0, + "step": 3774 + }, + { + "epoch": 0.4802188016791757, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.55641746520996, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8631781339645386, + "num_tokens": 144020951.0, + "step": 3775 + }, + { + "epoch": 0.48034601195776616, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.52734375, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8638656139373779, + "num_tokens": 144058341.0, + "step": 3776 + }, + { + "epoch": 0.4804732222363567, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.691242218017578, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.847907543182373, + "num_tokens": 144098232.0, + "step": 3777 + }, + { + "epoch": 0.4806004325149472, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.250102996826172, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8501179814338684, + "num_tokens": 144137537.0, + "step": 3778 + }, + { + "epoch": 0.4807276427935377, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.880992889404297, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8597549796104431, + "num_tokens": 144178459.0, + "step": 3779 + }, + { + "epoch": 0.4808548530721282, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.287731170654297, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8494590520858765, + "num_tokens": 144219257.0, + "step": 3780 + }, + { + "epoch": 0.48098206335071875, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.821435928344727, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8602787256240845, + "num_tokens": 144262232.0, + "step": 3781 + }, + { + "epoch": 0.4811092736293093, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4659481048584, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8328800797462463, + "num_tokens": 144297144.0, + "step": 3782 + }, + { + "epoch": 0.48123648390789975, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.332691192626953, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8549872040748596, + "num_tokens": 144336078.0, + "step": 3783 + }, + { + "epoch": 0.4813636941864903, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.560531616210938, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8477784395217896, + "num_tokens": 144380162.0, + "step": 3784 + }, + { + "epoch": 0.4814909044650808, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.39871597290039, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8399089574813843, + "num_tokens": 144416735.0, + "step": 3785 + }, + { + "epoch": 0.4816181147436713, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.42261505126953, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8430145978927612, + "num_tokens": 144452780.0, + "step": 3786 + }, + { + "epoch": 0.4817453250222618, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.731395721435547, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8605907559394836, + "num_tokens": 144494155.0, + "step": 3787 + }, + { + "epoch": 0.48187253530085233, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.362119674682617, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8641393780708313, + "num_tokens": 144533989.0, + "step": 3788 + }, + { + "epoch": 0.4819997455794428, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.42619514465332, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8510019779205322, + "num_tokens": 144577409.0, + "step": 3789 + }, + { + "epoch": 0.48212695585803333, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43623161315918, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8469794988632202, + "num_tokens": 144618501.0, + "step": 3790 + }, + { + "epoch": 0.48225416613662386, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.573238372802734, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8622788786888123, + "num_tokens": 144659454.0, + "step": 3791 + }, + { + "epoch": 0.48238137641521434, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.50578498840332, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.859733521938324, + "num_tokens": 144695580.0, + "step": 3792 + }, + { + "epoch": 0.48250858669380486, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43831443786621, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8382100462913513, + "num_tokens": 144735586.0, + "step": 3793 + }, + { + "epoch": 0.4826357969723954, + "ewc_loss": 0.021484375, + "ewc_loss_parallel": 2.1457672119140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.363630294799805, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8552823662757874, + "num_tokens": 144776371.0, + "step": 3794 + }, + { + "epoch": 0.48276300725098586, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.39527702331543, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8490489721298218, + "num_tokens": 144816724.0, + "step": 3795 + }, + { + "epoch": 0.4828902175295764, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.652393341064453, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8613848090171814, + "num_tokens": 144854591.0, + "step": 3796 + }, + { + "epoch": 0.4830174278081669, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.359472274780273, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8564972877502441, + "num_tokens": 144893694.0, + "step": 3797 + }, + { + "epoch": 0.4831446380867574, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.410860061645508, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8515003323554993, + "num_tokens": 144932985.0, + "step": 3798 + }, + { + "epoch": 0.4832718483653479, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.563491821289062, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8540581464767456, + "num_tokens": 144967823.0, + "step": 3799 + }, + { + "epoch": 0.48339905864393845, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.418323516845703, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8745023608207703, + "num_tokens": 145007750.0, + "step": 3800 + }, + { + "epoch": 0.4835262689225289, + "ewc_loss": 0.0216064453125, + "ewc_loss_parallel": 2.1576881408691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.45868492126465, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8502047061920166, + "num_tokens": 145048596.0, + "step": 3801 + }, + { + "epoch": 0.48365347920111945, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.550132751464844, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8444157838821411, + "num_tokens": 145080868.0, + "step": 3802 + }, + { + "epoch": 0.48378068947971, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.33970069885254, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8644185066223145, + "num_tokens": 145115865.0, + "step": 3803 + }, + { + "epoch": 0.48390789975830045, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.544031143188477, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8681001663208008, + "num_tokens": 145151748.0, + "step": 3804 + }, + { + "epoch": 0.484035110036891, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.263046264648438, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8561809062957764, + "num_tokens": 145187074.0, + "step": 3805 + }, + { + "epoch": 0.4841623203154815, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.63372802734375, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8722050189971924, + "num_tokens": 145224855.0, + "step": 3806 + }, + { + "epoch": 0.484289530594072, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.466222763061523, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8641870021820068, + "num_tokens": 145266538.0, + "step": 3807 + }, + { + "epoch": 0.4844167408726625, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.457931518554688, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8601822853088379, + "num_tokens": 145303560.0, + "step": 3808 + }, + { + "epoch": 0.48454395115125304, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.427047729492188, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8553746342658997, + "num_tokens": 145345985.0, + "step": 3809 + }, + { + "epoch": 0.4846711614298435, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.35649871826172, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8451594710350037, + "num_tokens": 145382132.0, + "step": 3810 + }, + { + "epoch": 0.48479837170843404, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.6015625, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8458104133605957, + "num_tokens": 145418992.0, + "step": 3811 + }, + { + "epoch": 0.48492558198702457, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.42499542236328, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8540955781936646, + "num_tokens": 145455892.0, + "step": 3812 + }, + { + "epoch": 0.48505279226561504, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.517305374145508, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8526715040206909, + "num_tokens": 145497217.0, + "step": 3813 + }, + { + "epoch": 0.48518000254420557, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.352554321289062, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.862149715423584, + "num_tokens": 145533822.0, + "step": 3814 + }, + { + "epoch": 0.4853072128227961, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.418344497680664, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8746916055679321, + "num_tokens": 145572967.0, + "step": 3815 + }, + { + "epoch": 0.48543442310138657, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.348093032836914, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8683837056159973, + "num_tokens": 145609436.0, + "step": 3816 + }, + { + "epoch": 0.4855616333799771, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.628990173339844, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8666961789131165, + "num_tokens": 145645083.0, + "step": 3817 + }, + { + "epoch": 0.4856888436585676, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.486373901367188, + "learning_rate": 1e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.8368019461631775, + "num_tokens": 145680249.0, + "step": 3818 + }, + { + "epoch": 0.4858160539371581, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4053897857666, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8606963753700256, + "num_tokens": 145718569.0, + "step": 3819 + }, + { + "epoch": 0.4859432642157486, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.509248733520508, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8389052748680115, + "num_tokens": 145751868.0, + "step": 3820 + }, + { + "epoch": 0.48607047449433916, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.45081901550293, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8597015142440796, + "num_tokens": 145787881.0, + "step": 3821 + }, + { + "epoch": 0.48619768477292963, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.575054168701172, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.862757682800293, + "num_tokens": 145824010.0, + "step": 3822 + }, + { + "epoch": 0.48632489505152016, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.677637100219727, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8624039888381958, + "num_tokens": 145860981.0, + "step": 3823 + }, + { + "epoch": 0.4864521053301107, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.336957931518555, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.859805166721344, + "num_tokens": 145900687.0, + "step": 3824 + }, + { + "epoch": 0.48657931560870116, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.589887619018555, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8475907444953918, + "num_tokens": 145938888.0, + "step": 3825 + }, + { + "epoch": 0.4867065258872917, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.442378997802734, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8509622812271118, + "num_tokens": 145987060.0, + "step": 3826 + }, + { + "epoch": 0.4868337361658822, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.5310115814209, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8511056303977966, + "num_tokens": 146026447.0, + "step": 3827 + }, + { + "epoch": 0.4869609464444727, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.601024627685547, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.857792854309082, + "num_tokens": 146067099.0, + "step": 3828 + }, + { + "epoch": 0.4870881567230632, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.32784080505371, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.842606246471405, + "num_tokens": 146110767.0, + "step": 3829 + }, + { + "epoch": 0.48721536700165374, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.57486915588379, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8479692935943604, + "num_tokens": 146147603.0, + "step": 3830 + }, + { + "epoch": 0.48734257728024427, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.352832794189453, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8653949499130249, + "num_tokens": 146188821.0, + "step": 3831 + }, + { + "epoch": 0.48746978755883474, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.676265716552734, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8531167507171631, + "num_tokens": 146231886.0, + "step": 3832 + }, + { + "epoch": 0.48759699783742527, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.55567741394043, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.833519697189331, + "num_tokens": 146271965.0, + "step": 3833 + }, + { + "epoch": 0.4877242081160158, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.437097549438477, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8426486253738403, + "num_tokens": 146316686.0, + "step": 3834 + }, + { + "epoch": 0.4878514183946063, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.38852310180664, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8462069630622864, + "num_tokens": 146355098.0, + "step": 3835 + }, + { + "epoch": 0.4879786286731968, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.478612899780273, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8610904216766357, + "num_tokens": 146390666.0, + "step": 3836 + }, + { + "epoch": 0.48810583895178733, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.556137084960938, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8626747131347656, + "num_tokens": 146426740.0, + "step": 3837 + }, + { + "epoch": 0.4882330492303778, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.50059700012207, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8458166122436523, + "num_tokens": 146462668.0, + "step": 3838 + }, + { + "epoch": 0.48836025950896833, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.502168655395508, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8481972813606262, + "num_tokens": 146500160.0, + "step": 3839 + }, + { + "epoch": 0.48848746978755886, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43277931213379, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.850838303565979, + "num_tokens": 146542963.0, + "step": 3840 + }, + { + "epoch": 0.48861468006614933, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.423805236816406, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8621301651000977, + "num_tokens": 146580649.0, + "step": 3841 + }, + { + "epoch": 0.48874189034473986, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.52684783935547, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8622480034828186, + "num_tokens": 146620823.0, + "step": 3842 + }, + { + "epoch": 0.4888691006233304, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.46670150756836, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8655638694763184, + "num_tokens": 146663700.0, + "step": 3843 + }, + { + "epoch": 0.48899631090192086, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.493738174438477, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.850358784198761, + "num_tokens": 146698211.0, + "step": 3844 + }, + { + "epoch": 0.4891235211805114, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.397802352905273, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.853005051612854, + "num_tokens": 146738380.0, + "step": 3845 + }, + { + "epoch": 0.4892507314591019, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.279705047607422, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8580735921859741, + "num_tokens": 146775413.0, + "step": 3846 + }, + { + "epoch": 0.4893779417376924, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.421602249145508, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8572973608970642, + "num_tokens": 146814774.0, + "step": 3847 + }, + { + "epoch": 0.4895051520162829, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.333566665649414, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8643098473548889, + "num_tokens": 146853957.0, + "step": 3848 + }, + { + "epoch": 0.48963236229487345, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.382099151611328, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8476202487945557, + "num_tokens": 146889060.0, + "step": 3849 + }, + { + "epoch": 0.4897595725734639, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.417978286743164, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8651179075241089, + "num_tokens": 146929849.0, + "step": 3850 + }, + { + "epoch": 0.48988678285205445, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.440933227539062, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8574479222297668, + "num_tokens": 146968376.0, + "step": 3851 + }, + { + "epoch": 0.490013993130645, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.55608558654785, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.840420126914978, + "num_tokens": 147002871.0, + "step": 3852 + }, + { + "epoch": 0.49014120340923545, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.351762771606445, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8519102931022644, + "num_tokens": 147039412.0, + "step": 3853 + }, + { + "epoch": 0.490268413687826, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.411508560180664, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8594921827316284, + "num_tokens": 147080307.0, + "step": 3854 + }, + { + "epoch": 0.4903956239664165, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.54472541809082, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8633399605751038, + "num_tokens": 147120131.0, + "step": 3855 + }, + { + "epoch": 0.490522834245007, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.40502166748047, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8483923673629761, + "num_tokens": 147155760.0, + "step": 3856 + }, + { + "epoch": 0.4906500445235975, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.62816047668457, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8590282201766968, + "num_tokens": 147195406.0, + "step": 3857 + }, + { + "epoch": 0.49077725480218803, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.417686462402344, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8687944412231445, + "num_tokens": 147235688.0, + "step": 3858 + }, + { + "epoch": 0.4909044650807785, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.653980255126953, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.863706111907959, + "num_tokens": 147271575.0, + "step": 3859 + }, + { + "epoch": 0.49103167535936904, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.560047149658203, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8524346351623535, + "num_tokens": 147315473.0, + "step": 3860 + }, + { + "epoch": 0.49115888563795956, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.583097457885742, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8598395586013794, + "num_tokens": 147359058.0, + "step": 3861 + }, + { + "epoch": 0.49128609591655004, + "ewc_loss": 0.021728515625, + "ewc_loss_parallel": 2.1696090698242188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.658266067504883, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8603547811508179, + "num_tokens": 147394443.0, + "step": 3862 + }, + { + "epoch": 0.49141330619514056, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.44430160522461, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8506925106048584, + "num_tokens": 147436225.0, + "step": 3863 + }, + { + "epoch": 0.4915405164737311, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.496973037719727, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8656413555145264, + "num_tokens": 147473449.0, + "step": 3864 + }, + { + "epoch": 0.49166772675232157, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.412574768066406, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8537706732749939, + "num_tokens": 147510330.0, + "step": 3865 + }, + { + "epoch": 0.4917949370309121, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.377967834472656, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.867432713508606, + "num_tokens": 147544683.0, + "step": 3866 + }, + { + "epoch": 0.4919221473095026, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.610538482666016, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8739748001098633, + "num_tokens": 147589330.0, + "step": 3867 + }, + { + "epoch": 0.4920493575880931, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4249267578125, + "learning_rate": 1e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8257216811180115, + "num_tokens": 147626864.0, + "step": 3868 + }, + { + "epoch": 0.4921765678666836, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.436716079711914, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8383758664131165, + "num_tokens": 147672006.0, + "step": 3869 + }, + { + "epoch": 0.49230377814527415, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.526309967041016, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8469768762588501, + "num_tokens": 147717388.0, + "step": 3870 + }, + { + "epoch": 0.4924309884238646, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.596586227416992, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8435176014900208, + "num_tokens": 147759827.0, + "step": 3871 + }, + { + "epoch": 0.49255819870245515, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.5819091796875, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8367010951042175, + "num_tokens": 147793742.0, + "step": 3872 + }, + { + "epoch": 0.4926854089810457, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.32790184020996, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8658308386802673, + "num_tokens": 147829881.0, + "step": 3873 + }, + { + "epoch": 0.49281261925963615, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65793228149414, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.840753972530365, + "num_tokens": 147875051.0, + "step": 3874 + }, + { + "epoch": 0.4929398295382267, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.507305145263672, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8483699560165405, + "num_tokens": 147912786.0, + "step": 3875 + }, + { + "epoch": 0.4930670398168172, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.502588272094727, + "learning_rate": 1e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8349718451499939, + "num_tokens": 147954981.0, + "step": 3876 + }, + { + "epoch": 0.4931942500954077, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.59758186340332, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8487712740898132, + "num_tokens": 147992179.0, + "step": 3877 + }, + { + "epoch": 0.4933214603739982, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.384618759155273, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8609305620193481, + "num_tokens": 148029866.0, + "step": 3878 + }, + { + "epoch": 0.49344867065258874, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.727497100830078, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8558922410011292, + "num_tokens": 148070057.0, + "step": 3879 + }, + { + "epoch": 0.4935758809311792, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.420907974243164, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8589743375778198, + "num_tokens": 148108461.0, + "step": 3880 + }, + { + "epoch": 0.49370309120976974, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.670103073120117, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8484100699424744, + "num_tokens": 148147609.0, + "step": 3881 + }, + { + "epoch": 0.49383030148836027, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.453460693359375, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8505525588989258, + "num_tokens": 148190475.0, + "step": 3882 + }, + { + "epoch": 0.4939575117669508, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.47629737854004, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8664213418960571, + "num_tokens": 148221187.0, + "step": 3883 + }, + { + "epoch": 0.49408472204554127, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.54450035095215, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.863391637802124, + "num_tokens": 148260916.0, + "step": 3884 + }, + { + "epoch": 0.4942119323241318, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.47716522216797, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8604180812835693, + "num_tokens": 148295591.0, + "step": 3885 + }, + { + "epoch": 0.4943391426027223, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.409488677978516, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.849397599697113, + "num_tokens": 148333445.0, + "step": 3886 + }, + { + "epoch": 0.4944663528813128, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.539478302001953, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8647037148475647, + "num_tokens": 148367951.0, + "step": 3887 + }, + { + "epoch": 0.4945935631599033, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.621620178222656, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8477786779403687, + "num_tokens": 148408491.0, + "step": 3888 + }, + { + "epoch": 0.49472077343849385, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.396379470825195, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8548436760902405, + "num_tokens": 148443883.0, + "step": 3889 + }, + { + "epoch": 0.4948479837170843, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.653898239135742, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8570929169654846, + "num_tokens": 148480511.0, + "step": 3890 + }, + { + "epoch": 0.49497519399567486, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.2823429107666, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8498060703277588, + "num_tokens": 148514759.0, + "step": 3891 + }, + { + "epoch": 0.4951024042742654, + "ewc_loss": 0.0218505859375, + "ewc_loss_parallel": 2.181529998779297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.48680877685547, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8568406701087952, + "num_tokens": 148553209.0, + "step": 3892 + }, + { + "epoch": 0.49522961455285586, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.406522750854492, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8462468981742859, + "num_tokens": 148590387.0, + "step": 3893 + }, + { + "epoch": 0.4953568248314464, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.47057342529297, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8529554605484009, + "num_tokens": 148625201.0, + "step": 3894 + }, + { + "epoch": 0.4954840351100369, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.506277084350586, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8590389490127563, + "num_tokens": 148665989.0, + "step": 3895 + }, + { + "epoch": 0.4956112453886274, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.611391067504883, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8579942584037781, + "num_tokens": 148704839.0, + "step": 3896 + }, + { + "epoch": 0.4957384556672179, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.552629470825195, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8647603988647461, + "num_tokens": 148740523.0, + "step": 3897 + }, + { + "epoch": 0.49586566594580844, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.466968536376953, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8706799745559692, + "num_tokens": 148779103.0, + "step": 3898 + }, + { + "epoch": 0.4959928762243989, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.396936416625977, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8663303256034851, + "num_tokens": 148814659.0, + "step": 3899 + }, + { + "epoch": 0.49612008650298944, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.652172088623047, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8756898641586304, + "num_tokens": 148856864.0, + "step": 3900 + }, + { + "epoch": 0.49624729678157997, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.53012466430664, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8618385195732117, + "num_tokens": 148890309.0, + "step": 3901 + }, + { + "epoch": 0.49637450706017044, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.659269332885742, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.839917778968811, + "num_tokens": 148926662.0, + "step": 3902 + }, + { + "epoch": 0.496501717338761, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.518146514892578, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8432445526123047, + "num_tokens": 148963032.0, + "step": 3903 + }, + { + "epoch": 0.4966289276173515, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.43656349182129, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8619574308395386, + "num_tokens": 148996684.0, + "step": 3904 + }, + { + "epoch": 0.496756137895942, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.669286727905273, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8556920886039734, + "num_tokens": 149032735.0, + "step": 3905 + }, + { + "epoch": 0.4968833481745325, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65496826171875, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8573839664459229, + "num_tokens": 149069415.0, + "step": 3906 + }, + { + "epoch": 0.49701055845312303, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.578815460205078, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.842001736164093, + "num_tokens": 149115042.0, + "step": 3907 + }, + { + "epoch": 0.4971377687317135, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.341623306274414, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8633390069007874, + "num_tokens": 149149767.0, + "step": 3908 + }, + { + "epoch": 0.49726497901030403, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.761207580566406, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8494704961776733, + "num_tokens": 149193000.0, + "step": 3909 + }, + { + "epoch": 0.49739218928889456, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.400880813598633, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8531805276870728, + "num_tokens": 149226963.0, + "step": 3910 + }, + { + "epoch": 0.49751939956748503, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.799976348876953, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8507421612739563, + "num_tokens": 149268139.0, + "step": 3911 + }, + { + "epoch": 0.49764660984607556, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.644392013549805, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8635837435722351, + "num_tokens": 149305882.0, + "step": 3912 + }, + { + "epoch": 0.4977738201246661, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.61289405822754, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8527873754501343, + "num_tokens": 149344119.0, + "step": 3913 + }, + { + "epoch": 0.49790103040325656, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.52671241760254, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8465267419815063, + "num_tokens": 149384102.0, + "step": 3914 + }, + { + "epoch": 0.4980282406818471, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.506277084350586, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8539791107177734, + "num_tokens": 149425677.0, + "step": 3915 + }, + { + "epoch": 0.4981554509604376, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.630321502685547, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8518509864807129, + "num_tokens": 149463275.0, + "step": 3916 + }, + { + "epoch": 0.4982826612390281, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.64945411682129, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8695365786552429, + "num_tokens": 149503950.0, + "step": 3917 + }, + { + "epoch": 0.4984098715176186, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.59259796142578, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8468148112297058, + "num_tokens": 149541192.0, + "step": 3918 + }, + { + "epoch": 0.49853708179620915, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.644596099853516, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8520424365997314, + "num_tokens": 149581784.0, + "step": 3919 + }, + { + "epoch": 0.4986642920747996, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.58382225036621, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8597074747085571, + "num_tokens": 149619312.0, + "step": 3920 + }, + { + "epoch": 0.49879150235339015, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.5748233795166, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8535122871398926, + "num_tokens": 149657443.0, + "step": 3921 + }, + { + "epoch": 0.4989187126319807, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.396671295166016, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8667880296707153, + "num_tokens": 149694992.0, + "step": 3922 + }, + { + "epoch": 0.49904592291057115, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.636045455932617, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8518526554107666, + "num_tokens": 149729554.0, + "step": 3923 + }, + { + "epoch": 0.4991731331891617, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.6982421875, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8611500263214111, + "num_tokens": 149770051.0, + "step": 3924 + }, + { + "epoch": 0.4993003434677522, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4422607421875, + "learning_rate": 1e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8241167068481445, + "num_tokens": 149806724.0, + "step": 3925 + }, + { + "epoch": 0.4994275537463427, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.56949806213379, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8470965623855591, + "num_tokens": 149849874.0, + "step": 3926 + }, + { + "epoch": 0.4995547640249332, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.589279174804688, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8544349670410156, + "num_tokens": 149888076.0, + "step": 3927 + }, + { + "epoch": 0.49968197430352373, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.710935592651367, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8561913967132568, + "num_tokens": 149923750.0, + "step": 3928 + }, + { + "epoch": 0.4998091845821142, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.4188289642334, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8435512185096741, + "num_tokens": 149960657.0, + "step": 3929 + }, + { + "epoch": 0.49993639486070474, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.670583724975586, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8714863657951355, + "num_tokens": 149995983.0, + "step": 3930 + }, + { + "epoch": 0.5000636051392953, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.481765747070312, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.833301305770874, + "num_tokens": 150037508.0, + "step": 3931 + }, + { + "epoch": 0.5001908154178858, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.51462173461914, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8506656885147095, + "num_tokens": 150077806.0, + "step": 3932 + }, + { + "epoch": 0.5003180256964763, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.41668128967285, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8539401292800903, + "num_tokens": 150117589.0, + "step": 3933 + }, + { + "epoch": 0.5004452359750667, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.463966369628906, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8490805625915527, + "num_tokens": 150161579.0, + "step": 3934 + }, + { + "epoch": 0.5005724462536573, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.686275482177734, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8426780700683594, + "num_tokens": 150208018.0, + "step": 3935 + }, + { + "epoch": 0.5006996565322478, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.601709365844727, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8568617701530457, + "num_tokens": 150250164.0, + "step": 3936 + }, + { + "epoch": 0.5008268668108383, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.534536361694336, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8739917278289795, + "num_tokens": 150289972.0, + "step": 3937 + }, + { + "epoch": 0.5009540770894289, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.288833618164062, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8539696931838989, + "num_tokens": 150333656.0, + "step": 3938 + }, + { + "epoch": 0.5010812873680194, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.418779373168945, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8682301044464111, + "num_tokens": 150374086.0, + "step": 3939 + }, + { + "epoch": 0.5012084976466098, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.485517501831055, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8586382269859314, + "num_tokens": 150407472.0, + "step": 3940 + }, + { + "epoch": 0.5013357079252003, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.61307716369629, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8607791066169739, + "num_tokens": 150444560.0, + "step": 3941 + }, + { + "epoch": 0.5014629182037909, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.472137451171875, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.859722375869751, + "num_tokens": 150483621.0, + "step": 3942 + }, + { + "epoch": 0.5015901284823814, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.589828491210938, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8556849360466003, + "num_tokens": 150520414.0, + "step": 3943 + }, + { + "epoch": 0.5017173387609719, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.535888671875, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8471856117248535, + "num_tokens": 150556589.0, + "step": 3944 + }, + { + "epoch": 0.5018445490395624, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.69209098815918, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8536850214004517, + "num_tokens": 150592393.0, + "step": 3945 + }, + { + "epoch": 0.5019717593181529, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.573392868041992, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.84181809425354, + "num_tokens": 150630457.0, + "step": 3946 + }, + { + "epoch": 0.5020989695967434, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.574010848999023, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8631705641746521, + "num_tokens": 150659482.0, + "step": 3947 + }, + { + "epoch": 0.5022261798753339, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.398529052734375, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8552564382553101, + "num_tokens": 150700920.0, + "step": 3948 + }, + { + "epoch": 0.5023533901539244, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.663850784301758, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8661309480667114, + "num_tokens": 150736393.0, + "step": 3949 + }, + { + "epoch": 0.502480600432515, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.560422897338867, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8522212505340576, + "num_tokens": 150783848.0, + "step": 3950 + }, + { + "epoch": 0.5026078107111055, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.621875762939453, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.851924479007721, + "num_tokens": 150824179.0, + "step": 3951 + }, + { + "epoch": 0.5027350209896959, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.57623291015625, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8556785583496094, + "num_tokens": 150862775.0, + "step": 3952 + }, + { + "epoch": 0.5028622312682864, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.590675354003906, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8547836542129517, + "num_tokens": 150900005.0, + "step": 3953 + }, + { + "epoch": 0.502989441546877, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.588035583496094, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8517217636108398, + "num_tokens": 150936148.0, + "step": 3954 + }, + { + "epoch": 0.5031166518254675, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.60409927368164, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8519130349159241, + "num_tokens": 150971661.0, + "step": 3955 + }, + { + "epoch": 0.503243862104058, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.607078552246094, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.866305947303772, + "num_tokens": 151009521.0, + "step": 3956 + }, + { + "epoch": 0.5033710723826486, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.470170974731445, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8513376712799072, + "num_tokens": 151053641.0, + "step": 3957 + }, + { + "epoch": 0.5034982826612391, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.656177520751953, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8588501214981079, + "num_tokens": 151094476.0, + "step": 3958 + }, + { + "epoch": 0.5036254929398295, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.472043991088867, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8488476276397705, + "num_tokens": 151126381.0, + "step": 3959 + }, + { + "epoch": 0.50375270321842, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.596298217773438, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8602234125137329, + "num_tokens": 151156875.0, + "step": 3960 + }, + { + "epoch": 0.5038799134970106, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.625940322875977, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8510938882827759, + "num_tokens": 151194394.0, + "step": 3961 + }, + { + "epoch": 0.5040071237756011, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.660076141357422, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8413481712341309, + "num_tokens": 151234472.0, + "step": 3962 + }, + { + "epoch": 0.5041343340541916, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.482641220092773, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8513864278793335, + "num_tokens": 151270426.0, + "step": 3963 + }, + { + "epoch": 0.5042615443327821, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.54935073852539, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8534340858459473, + "num_tokens": 151305793.0, + "step": 3964 + }, + { + "epoch": 0.5043887546113726, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.422561645507812, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8422542810440063, + "num_tokens": 151345116.0, + "step": 3965 + }, + { + "epoch": 0.5045159648899631, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.498716354370117, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8598237037658691, + "num_tokens": 151381901.0, + "step": 3966 + }, + { + "epoch": 0.5046431751685536, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.60890769958496, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8575947880744934, + "num_tokens": 151426884.0, + "step": 3967 + }, + { + "epoch": 0.5047703854471441, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.571483612060547, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8482678532600403, + "num_tokens": 151467596.0, + "step": 3968 + }, + { + "epoch": 0.5048975957257347, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.571144104003906, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8744003176689148, + "num_tokens": 151502131.0, + "step": 3969 + }, + { + "epoch": 0.5050248060043252, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.48938751220703, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8524218201637268, + "num_tokens": 151538932.0, + "step": 3970 + }, + { + "epoch": 0.5051520162829156, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.665830612182617, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8543053865432739, + "num_tokens": 151577839.0, + "step": 3971 + }, + { + "epoch": 0.5052792265615061, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65631675720215, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8547267913818359, + "num_tokens": 151618984.0, + "step": 3972 + }, + { + "epoch": 0.5054064368400967, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.682357788085938, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8545963764190674, + "num_tokens": 151658392.0, + "step": 3973 + }, + { + "epoch": 0.5055336471186872, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.679719924926758, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8656731247901917, + "num_tokens": 151689781.0, + "step": 3974 + }, + { + "epoch": 0.5056608573972777, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.563339233398438, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8401699662208557, + "num_tokens": 151729612.0, + "step": 3975 + }, + { + "epoch": 0.5057880676758683, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.587007522583008, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8427999019622803, + "num_tokens": 151769270.0, + "step": 3976 + }, + { + "epoch": 0.5059152779544587, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77180290222168, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8362267017364502, + "num_tokens": 151803936.0, + "step": 3977 + }, + { + "epoch": 0.5060424882330492, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.76565933227539, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8430314660072327, + "num_tokens": 151840429.0, + "step": 3978 + }, + { + "epoch": 0.5061696985116397, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.616981506347656, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8569923639297485, + "num_tokens": 151883296.0, + "step": 3979 + }, + { + "epoch": 0.5062969087902303, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.607282638549805, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8689195513725281, + "num_tokens": 151917658.0, + "step": 3980 + }, + { + "epoch": 0.5064241190688208, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.60150146484375, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.854251503944397, + "num_tokens": 151958822.0, + "step": 3981 + }, + { + "epoch": 0.5065513293474113, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.57807159423828, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8497430086135864, + "num_tokens": 151994964.0, + "step": 3982 + }, + { + "epoch": 0.5066785396260017, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.480274200439453, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8497361540794373, + "num_tokens": 152037013.0, + "step": 3983 + }, + { + "epoch": 0.5068057499045923, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.59128189086914, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8507975339889526, + "num_tokens": 152074821.0, + "step": 3984 + }, + { + "epoch": 0.5069329601831828, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.589523315429688, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8478950262069702, + "num_tokens": 152116667.0, + "step": 3985 + }, + { + "epoch": 0.5070601704617733, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.725900650024414, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8589709997177124, + "num_tokens": 152152714.0, + "step": 3986 + }, + { + "epoch": 0.5071873807403638, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.657278060913086, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8403944373130798, + "num_tokens": 152190230.0, + "step": 3987 + }, + { + "epoch": 0.5073145910189544, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.662368774414062, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.852921724319458, + "num_tokens": 152228887.0, + "step": 3988 + }, + { + "epoch": 0.5074418012975448, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.489599227905273, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8638123273849487, + "num_tokens": 152271863.0, + "step": 3989 + }, + { + "epoch": 0.5075690115761353, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.680110931396484, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8491332530975342, + "num_tokens": 152310765.0, + "step": 3990 + }, + { + "epoch": 0.5076962218547258, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.62615394592285, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8763260841369629, + "num_tokens": 152342545.0, + "step": 3991 + }, + { + "epoch": 0.5078234321333164, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.598424911499023, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8385201692581177, + "num_tokens": 152386223.0, + "step": 3992 + }, + { + "epoch": 0.5079506424119069, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.612375259399414, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8616423010826111, + "num_tokens": 152421811.0, + "step": 3993 + }, + { + "epoch": 0.5080778526904974, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.566797256469727, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.842402458190918, + "num_tokens": 152454581.0, + "step": 3994 + }, + { + "epoch": 0.5082050629690879, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.614429473876953, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8441120386123657, + "num_tokens": 152490591.0, + "step": 3995 + }, + { + "epoch": 0.5083322732476784, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.67889976501465, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8772753477096558, + "num_tokens": 152532290.0, + "step": 3996 + }, + { + "epoch": 0.5084594835262689, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65802574157715, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8605785369873047, + "num_tokens": 152571984.0, + "step": 3997 + }, + { + "epoch": 0.5085866938048594, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.610105514526367, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8553082942962646, + "num_tokens": 152611082.0, + "step": 3998 + }, + { + "epoch": 0.50871390408345, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.657487869262695, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8627731204032898, + "num_tokens": 152655303.0, + "step": 3999 + }, + { + "epoch": 0.5088411143620405, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.539459228515625, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.853573739528656, + "num_tokens": 152688432.0, + "step": 4000 + }, + { + "epoch": 0.5089683246406309, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.614187240600586, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.849614679813385, + "num_tokens": 152725637.0, + "step": 4001 + }, + { + "epoch": 0.5090955349192214, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.610414505004883, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8531126976013184, + "num_tokens": 152763108.0, + "step": 4002 + }, + { + "epoch": 0.509222745197812, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.653078079223633, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8610011339187622, + "num_tokens": 152800659.0, + "step": 4003 + }, + { + "epoch": 0.5093499554764025, + "ewc_loss": 0.02197265625, + "ewc_loss_parallel": 2.193450927734375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.525793075561523, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8457390069961548, + "num_tokens": 152842797.0, + "step": 4004 + }, + { + "epoch": 0.509477165754993, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.629344940185547, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8457438945770264, + "num_tokens": 152888257.0, + "step": 4005 + }, + { + "epoch": 0.5096043760335836, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.504762649536133, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8537499904632568, + "num_tokens": 152929089.0, + "step": 4006 + }, + { + "epoch": 0.5097315863121741, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.55101203918457, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8485795855522156, + "num_tokens": 152972710.0, + "step": 4007 + }, + { + "epoch": 0.5098587965907645, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.411531448364258, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8550790548324585, + "num_tokens": 153008867.0, + "step": 4008 + }, + { + "epoch": 0.509986006869355, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.56437110900879, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8491793870925903, + "num_tokens": 153051497.0, + "step": 4009 + }, + { + "epoch": 0.5101132171479456, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.591102600097656, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8705793023109436, + "num_tokens": 153089295.0, + "step": 4010 + }, + { + "epoch": 0.5102404274265361, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.62773323059082, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.858803927898407, + "num_tokens": 153123957.0, + "step": 4011 + }, + { + "epoch": 0.5103676377051266, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.515661239624023, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8506189584732056, + "num_tokens": 153165248.0, + "step": 4012 + }, + { + "epoch": 0.5104948479837171, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.687314987182617, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8688727617263794, + "num_tokens": 153206163.0, + "step": 4013 + }, + { + "epoch": 0.5106220582623076, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77018165588379, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8459452390670776, + "num_tokens": 153248666.0, + "step": 4014 + }, + { + "epoch": 0.5107492685408981, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.59383773803711, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.877368688583374, + "num_tokens": 153289009.0, + "step": 4015 + }, + { + "epoch": 0.5108764788194886, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.775821685791016, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8595899939537048, + "num_tokens": 153321354.0, + "step": 4016 + }, + { + "epoch": 0.5110036890980791, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.56033706665039, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.853618323802948, + "num_tokens": 153361061.0, + "step": 4017 + }, + { + "epoch": 0.5111308993766697, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.466995239257812, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8491802215576172, + "num_tokens": 153397002.0, + "step": 4018 + }, + { + "epoch": 0.5112581096552602, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.747817993164062, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8408776521682739, + "num_tokens": 153435264.0, + "step": 4019 + }, + { + "epoch": 0.5113853199338506, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.566261291503906, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8532659411430359, + "num_tokens": 153474751.0, + "step": 4020 + }, + { + "epoch": 0.5115125302124411, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.726186752319336, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8626286387443542, + "num_tokens": 153510084.0, + "step": 4021 + }, + { + "epoch": 0.5116397404910317, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.63825225830078, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8410422801971436, + "num_tokens": 153550387.0, + "step": 4022 + }, + { + "epoch": 0.5117669507696222, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.613536834716797, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8762538433074951, + "num_tokens": 153592441.0, + "step": 4023 + }, + { + "epoch": 0.5118941610482127, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.59320068359375, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8613752126693726, + "num_tokens": 153629202.0, + "step": 4024 + }, + { + "epoch": 0.5120213713268033, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.69366455078125, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8506392240524292, + "num_tokens": 153669243.0, + "step": 4025 + }, + { + "epoch": 0.5121485816053937, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.62708282470703, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8500256538391113, + "num_tokens": 153709206.0, + "step": 4026 + }, + { + "epoch": 0.5122757918839842, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.592514038085938, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.861153244972229, + "num_tokens": 153741251.0, + "step": 4027 + }, + { + "epoch": 0.5124030021625747, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.556425094604492, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8408643007278442, + "num_tokens": 153775559.0, + "step": 4028 + }, + { + "epoch": 0.5125302124411653, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.5982608795166, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8592188358306885, + "num_tokens": 153819158.0, + "step": 4029 + }, + { + "epoch": 0.5126574227197558, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.607088088989258, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.848153829574585, + "num_tokens": 153856231.0, + "step": 4030 + }, + { + "epoch": 0.5127846329983463, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.677379608154297, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8657357692718506, + "num_tokens": 153897125.0, + "step": 4031 + }, + { + "epoch": 0.5129118432769367, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.553943634033203, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8614749312400818, + "num_tokens": 153933552.0, + "step": 4032 + }, + { + "epoch": 0.5130390535555273, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.70452117919922, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8389453291893005, + "num_tokens": 153974337.0, + "step": 4033 + }, + { + "epoch": 0.5131662638341178, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.631580352783203, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8497493267059326, + "num_tokens": 154018911.0, + "step": 4034 + }, + { + "epoch": 0.5132934741127083, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.686988830566406, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8654036521911621, + "num_tokens": 154056441.0, + "step": 4035 + }, + { + "epoch": 0.5134206843912988, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.62193489074707, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8447589874267578, + "num_tokens": 154096326.0, + "step": 4036 + }, + { + "epoch": 0.5135478946698894, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.56354522705078, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8588524460792542, + "num_tokens": 154137223.0, + "step": 4037 + }, + { + "epoch": 0.5136751049484798, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.714887619018555, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8527834415435791, + "num_tokens": 154168312.0, + "step": 4038 + }, + { + "epoch": 0.5138023152270703, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.704713821411133, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8693042993545532, + "num_tokens": 154203125.0, + "step": 4039 + }, + { + "epoch": 0.5139295255056608, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.614145278930664, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8496960997581482, + "num_tokens": 154248437.0, + "step": 4040 + }, + { + "epoch": 0.5140567357842514, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.818349838256836, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8585838079452515, + "num_tokens": 154286303.0, + "step": 4041 + }, + { + "epoch": 0.5141839460628419, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.438472747802734, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8463459014892578, + "num_tokens": 154328638.0, + "step": 4042 + }, + { + "epoch": 0.5143111563414324, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.885133743286133, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8532177209854126, + "num_tokens": 154362953.0, + "step": 4043 + }, + { + "epoch": 0.5144383666200228, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.69044303894043, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8422529697418213, + "num_tokens": 154401289.0, + "step": 4044 + }, + { + "epoch": 0.5145655768986134, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.619077682495117, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8590977191925049, + "num_tokens": 154438779.0, + "step": 4045 + }, + { + "epoch": 0.5146927871772039, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.73002815246582, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.848784327507019, + "num_tokens": 154467032.0, + "step": 4046 + }, + { + "epoch": 0.5148199974557944, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.587833404541016, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.865339994430542, + "num_tokens": 154505222.0, + "step": 4047 + }, + { + "epoch": 0.514947207734385, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.804609298706055, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8419699668884277, + "num_tokens": 154544953.0, + "step": 4048 + }, + { + "epoch": 0.5150744180129755, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.54961585998535, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8678926229476929, + "num_tokens": 154586193.0, + "step": 4049 + }, + { + "epoch": 0.5152016282915659, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.589326858520508, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8592074513435364, + "num_tokens": 154629531.0, + "step": 4050 + }, + { + "epoch": 0.5153288385701564, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.655044555664062, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8700182437896729, + "num_tokens": 154659482.0, + "step": 4051 + }, + { + "epoch": 0.515456048848747, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.72701072692871, + "learning_rate": 1e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.8219814300537109, + "num_tokens": 154702645.0, + "step": 4052 + }, + { + "epoch": 0.5155832591273375, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.56032371520996, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.863592267036438, + "num_tokens": 154743329.0, + "step": 4053 + }, + { + "epoch": 0.515710469405928, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.694353103637695, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8518174886703491, + "num_tokens": 154783460.0, + "step": 4054 + }, + { + "epoch": 0.5158376796845185, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.776702880859375, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8496576547622681, + "num_tokens": 154827209.0, + "step": 4055 + }, + { + "epoch": 0.5159648899631091, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.819732666015625, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.830484926700592, + "num_tokens": 154866577.0, + "step": 4056 + }, + { + "epoch": 0.5160921002416995, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.74352264404297, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8568648099899292, + "num_tokens": 154912191.0, + "step": 4057 + }, + { + "epoch": 0.51621931052029, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.782794952392578, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8413485288619995, + "num_tokens": 154953412.0, + "step": 4058 + }, + { + "epoch": 0.5163465207988805, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.891183853149414, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8666435480117798, + "num_tokens": 154989913.0, + "step": 4059 + }, + { + "epoch": 0.5164737310774711, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.623455047607422, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8422238826751709, + "num_tokens": 155038094.0, + "step": 4060 + }, + { + "epoch": 0.5166009413560616, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.574552536010742, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8444729447364807, + "num_tokens": 155075900.0, + "step": 4061 + }, + { + "epoch": 0.5167281516346521, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.811622619628906, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8535841107368469, + "num_tokens": 155117038.0, + "step": 4062 + }, + { + "epoch": 0.5168553619132426, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.916593551635742, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8683690428733826, + "num_tokens": 155151442.0, + "step": 4063 + }, + { + "epoch": 0.5169825721918331, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.969257354736328, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.862912118434906, + "num_tokens": 155192288.0, + "step": 4064 + }, + { + "epoch": 0.5171097824704236, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.951278686523438, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8576712012290955, + "num_tokens": 155238931.0, + "step": 4065 + }, + { + "epoch": 0.5172369927490141, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.686120986938477, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8678135871887207, + "num_tokens": 155276644.0, + "step": 4066 + }, + { + "epoch": 0.5173642030276047, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.07216453552246, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8668929934501648, + "num_tokens": 155307638.0, + "step": 4067 + }, + { + "epoch": 0.5174914133061952, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.738319396972656, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8478174209594727, + "num_tokens": 155343255.0, + "step": 4068 + }, + { + "epoch": 0.5176186235847856, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.566362380981445, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8572708368301392, + "num_tokens": 155382283.0, + "step": 4069 + }, + { + "epoch": 0.5177458338633761, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.977066040039062, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8325670957565308, + "num_tokens": 155427096.0, + "step": 4070 + }, + { + "epoch": 0.5178730441419667, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.66280746459961, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8431845903396606, + "num_tokens": 155467472.0, + "step": 4071 + }, + { + "epoch": 0.5180002544205572, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.67915916442871, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8499271273612976, + "num_tokens": 155508350.0, + "step": 4072 + }, + { + "epoch": 0.5181274646991477, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.740129470825195, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8776629567146301, + "num_tokens": 155547215.0, + "step": 4073 + }, + { + "epoch": 0.5182546749777382, + "ewc_loss": 0.0220947265625, + "ewc_loss_parallel": 2.205371856689453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.907041549682617, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8597376346588135, + "num_tokens": 155586658.0, + "step": 4074 + }, + { + "epoch": 0.5183818852563287, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83303451538086, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8571636080741882, + "num_tokens": 155620944.0, + "step": 4075 + }, + { + "epoch": 0.5185090955349192, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83144187927246, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8524214029312134, + "num_tokens": 155662699.0, + "step": 4076 + }, + { + "epoch": 0.5186363058135097, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.680944442749023, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8432635068893433, + "num_tokens": 155707783.0, + "step": 4077 + }, + { + "epoch": 0.5187635160921003, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.815420150756836, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8596550226211548, + "num_tokens": 155745227.0, + "step": 4078 + }, + { + "epoch": 0.5188907263706908, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.69417381286621, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8556944131851196, + "num_tokens": 155786466.0, + "step": 4079 + }, + { + "epoch": 0.5190179366492813, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.649131774902344, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8560584187507629, + "num_tokens": 155833145.0, + "step": 4080 + }, + { + "epoch": 0.5191451469278717, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.955724716186523, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8449525833129883, + "num_tokens": 155869533.0, + "step": 4081 + }, + { + "epoch": 0.5192723572064623, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.75087547302246, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.854038417339325, + "num_tokens": 155909329.0, + "step": 4082 + }, + { + "epoch": 0.5193995674850528, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.484661102294922, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.864314615726471, + "num_tokens": 155947848.0, + "step": 4083 + }, + { + "epoch": 0.5195267777636433, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.76622200012207, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8593990802764893, + "num_tokens": 155982693.0, + "step": 4084 + }, + { + "epoch": 0.5196539880422338, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.63824462890625, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8542628288269043, + "num_tokens": 156023414.0, + "step": 4085 + }, + { + "epoch": 0.5197811983208244, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.705472946166992, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8547906875610352, + "num_tokens": 156063488.0, + "step": 4086 + }, + { + "epoch": 0.5199084085994148, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.702987670898438, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8550205826759338, + "num_tokens": 156103081.0, + "step": 4087 + }, + { + "epoch": 0.5200356188780053, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.851686477661133, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8466569185256958, + "num_tokens": 156139453.0, + "step": 4088 + }, + { + "epoch": 0.5201628291565958, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.749929428100586, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8494188785552979, + "num_tokens": 156176564.0, + "step": 4089 + }, + { + "epoch": 0.5202900394351864, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.67494010925293, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8578245043754578, + "num_tokens": 156221278.0, + "step": 4090 + }, + { + "epoch": 0.5204172497137769, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.61360740661621, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8603099584579468, + "num_tokens": 156259124.0, + "step": 4091 + }, + { + "epoch": 0.5205444599923674, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.727481842041016, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8450671434402466, + "num_tokens": 156297832.0, + "step": 4092 + }, + { + "epoch": 0.5206716702709578, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.775129318237305, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.862113356590271, + "num_tokens": 156329745.0, + "step": 4093 + }, + { + "epoch": 0.5207988805495484, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.71854019165039, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8547581434249878, + "num_tokens": 156360154.0, + "step": 4094 + }, + { + "epoch": 0.5209260908281389, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.6838321685791, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8576290607452393, + "num_tokens": 156399360.0, + "step": 4095 + }, + { + "epoch": 0.5210533011067294, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.576135635375977, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8544436693191528, + "num_tokens": 156438782.0, + "step": 4096 + }, + { + "epoch": 0.52118051138532, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.753488540649414, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8611655235290527, + "num_tokens": 156470261.0, + "step": 4097 + }, + { + "epoch": 0.5213077216639105, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.573137283325195, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8377892971038818, + "num_tokens": 156510493.0, + "step": 4098 + }, + { + "epoch": 0.5214349319425009, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.823461532592773, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8571096658706665, + "num_tokens": 156545040.0, + "step": 4099 + }, + { + "epoch": 0.5215621422210914, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.464946746826172, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8531885147094727, + "num_tokens": 156583474.0, + "step": 4100 + }, + { + "epoch": 0.521689352499682, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.88232421875, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8548123836517334, + "num_tokens": 156627137.0, + "step": 4101 + }, + { + "epoch": 0.5218165627782725, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.746353149414062, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.843975305557251, + "num_tokens": 156664726.0, + "step": 4102 + }, + { + "epoch": 0.521943773056863, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.689311981201172, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8718804121017456, + "num_tokens": 156707014.0, + "step": 4103 + }, + { + "epoch": 0.5220709833354535, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.828067779541016, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8519423604011536, + "num_tokens": 156747784.0, + "step": 4104 + }, + { + "epoch": 0.522198193614044, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.801828384399414, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8532510995864868, + "num_tokens": 156784760.0, + "step": 4105 + }, + { + "epoch": 0.5223254038926345, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.574337005615234, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.85268235206604, + "num_tokens": 156822899.0, + "step": 4106 + }, + { + "epoch": 0.522452614171225, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.764625549316406, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8609606027603149, + "num_tokens": 156861308.0, + "step": 4107 + }, + { + "epoch": 0.5225798244498155, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.595008850097656, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8624250888824463, + "num_tokens": 156901981.0, + "step": 4108 + }, + { + "epoch": 0.5227070347284061, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.739238739013672, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8605079054832458, + "num_tokens": 156942215.0, + "step": 4109 + }, + { + "epoch": 0.5228342450069966, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.655498504638672, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8610752820968628, + "num_tokens": 156983820.0, + "step": 4110 + }, + { + "epoch": 0.5229614552855871, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.621429443359375, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8681151866912842, + "num_tokens": 157016215.0, + "step": 4111 + }, + { + "epoch": 0.5230886655641775, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.61920166015625, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.854354977607727, + "num_tokens": 157052071.0, + "step": 4112 + }, + { + "epoch": 0.5232158758427681, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.67231559753418, + "learning_rate": 1e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.832818329334259, + "num_tokens": 157097711.0, + "step": 4113 + }, + { + "epoch": 0.5233430861213586, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.50958824157715, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8534462451934814, + "num_tokens": 157134801.0, + "step": 4114 + }, + { + "epoch": 0.5234702963999491, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.46779441833496, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8560119867324829, + "num_tokens": 157171885.0, + "step": 4115 + }, + { + "epoch": 0.5235975066785397, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.710208892822266, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8534088134765625, + "num_tokens": 157211915.0, + "step": 4116 + }, + { + "epoch": 0.5237247169571302, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.574520111083984, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.84930419921875, + "num_tokens": 157260564.0, + "step": 4117 + }, + { + "epoch": 0.5238519272357206, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.73414421081543, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8568835258483887, + "num_tokens": 157294676.0, + "step": 4118 + }, + { + "epoch": 0.5239791375143111, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.539119720458984, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8660888671875, + "num_tokens": 157334607.0, + "step": 4119 + }, + { + "epoch": 0.5241063477929017, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.6722469329834, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8567598462104797, + "num_tokens": 157372021.0, + "step": 4120 + }, + { + "epoch": 0.5242335580714922, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.67764663696289, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8595285415649414, + "num_tokens": 157410092.0, + "step": 4121 + }, + { + "epoch": 0.5243607683500827, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.88072967529297, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8519230484962463, + "num_tokens": 157447979.0, + "step": 4122 + }, + { + "epoch": 0.5244879786286732, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.624805450439453, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8505605459213257, + "num_tokens": 157481376.0, + "step": 4123 + }, + { + "epoch": 0.5246151889072637, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.63998031616211, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.861067533493042, + "num_tokens": 157519555.0, + "step": 4124 + }, + { + "epoch": 0.5247423991858542, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.866825103759766, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8526058197021484, + "num_tokens": 157551849.0, + "step": 4125 + }, + { + "epoch": 0.5248696094644447, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.93777084350586, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8551273345947266, + "num_tokens": 157590823.0, + "step": 4126 + }, + { + "epoch": 0.5249968197430352, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.612550735473633, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8665001392364502, + "num_tokens": 157633771.0, + "step": 4127 + }, + { + "epoch": 0.5251240300216258, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.84292221069336, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8689859509468079, + "num_tokens": 157667796.0, + "step": 4128 + }, + { + "epoch": 0.5252512403002163, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.676481246948242, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.865985631942749, + "num_tokens": 157709452.0, + "step": 4129 + }, + { + "epoch": 0.5253784505788067, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.644058227539062, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8475527763366699, + "num_tokens": 157741407.0, + "step": 4130 + }, + { + "epoch": 0.5255056608573972, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.50899314880371, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8662160634994507, + "num_tokens": 157777693.0, + "step": 4131 + }, + { + "epoch": 0.5256328711359878, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.917390823364258, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8541606068611145, + "num_tokens": 157814729.0, + "step": 4132 + }, + { + "epoch": 0.5257600814145783, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.756778717041016, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8649437427520752, + "num_tokens": 157853698.0, + "step": 4133 + }, + { + "epoch": 0.5258872916931688, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.557275772094727, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8609903454780579, + "num_tokens": 157888580.0, + "step": 4134 + }, + { + "epoch": 0.5260145019717594, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.736825942993164, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8537774085998535, + "num_tokens": 157928434.0, + "step": 4135 + }, + { + "epoch": 0.5261417122503498, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.987384796142578, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8511233925819397, + "num_tokens": 157968936.0, + "step": 4136 + }, + { + "epoch": 0.5262689225289403, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.58759307861328, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8442075252532959, + "num_tokens": 158005496.0, + "step": 4137 + }, + { + "epoch": 0.5263961328075308, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.62603187561035, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8634815216064453, + "num_tokens": 158047569.0, + "step": 4138 + }, + { + "epoch": 0.5265233430861214, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.763896942138672, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.854979395866394, + "num_tokens": 158085733.0, + "step": 4139 + }, + { + "epoch": 0.5266505533647119, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.86738395690918, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.84599769115448, + "num_tokens": 158123906.0, + "step": 4140 + }, + { + "epoch": 0.5267777636433024, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.668224334716797, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8619007468223572, + "num_tokens": 158163663.0, + "step": 4141 + }, + { + "epoch": 0.5269049739218928, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.782176971435547, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8440996408462524, + "num_tokens": 158205878.0, + "step": 4142 + }, + { + "epoch": 0.5270321842004834, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.663776397705078, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8587894439697266, + "num_tokens": 158241608.0, + "step": 4143 + }, + { + "epoch": 0.5271593944790739, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.726762771606445, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8678131699562073, + "num_tokens": 158274259.0, + "step": 4144 + }, + { + "epoch": 0.5272866047576644, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.753225326538086, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8627574443817139, + "num_tokens": 158304793.0, + "step": 4145 + }, + { + "epoch": 0.527413815036255, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.68347930908203, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8694228529930115, + "num_tokens": 158349420.0, + "step": 4146 + }, + { + "epoch": 0.5275410253148455, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.717565536499023, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8540955781936646, + "num_tokens": 158395834.0, + "step": 4147 + }, + { + "epoch": 0.5276682355934359, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.645885467529297, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8572929501533508, + "num_tokens": 158435568.0, + "step": 4148 + }, + { + "epoch": 0.5277954458720264, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65871810913086, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8655940294265747, + "num_tokens": 158473584.0, + "step": 4149 + }, + { + "epoch": 0.527922656150617, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.843238830566406, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8566514849662781, + "num_tokens": 158513435.0, + "step": 4150 + }, + { + "epoch": 0.5280498664292075, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.779680252075195, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8566656112670898, + "num_tokens": 158549478.0, + "step": 4151 + }, + { + "epoch": 0.528177076707798, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.730072021484375, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.860579252243042, + "num_tokens": 158598742.0, + "step": 4152 + }, + { + "epoch": 0.5283042869863885, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.814931869506836, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8444852232933044, + "num_tokens": 158636806.0, + "step": 4153 + }, + { + "epoch": 0.528431497264979, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.603652954101562, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8498654961585999, + "num_tokens": 158676043.0, + "step": 4154 + }, + { + "epoch": 0.5285587075435695, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.860370635986328, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.840961217880249, + "num_tokens": 158711772.0, + "step": 4155 + }, + { + "epoch": 0.52868591782216, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.771873474121094, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8557428121566772, + "num_tokens": 158749608.0, + "step": 4156 + }, + { + "epoch": 0.5288131281007505, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65256690979004, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8562299013137817, + "num_tokens": 158790502.0, + "step": 4157 + }, + { + "epoch": 0.5289403383793411, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.785308837890625, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8315918445587158, + "num_tokens": 158828415.0, + "step": 4158 + }, + { + "epoch": 0.5290675486579316, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.780363082885742, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8468673229217529, + "num_tokens": 158866972.0, + "step": 4159 + }, + { + "epoch": 0.5291947589365221, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.704269409179688, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.853471040725708, + "num_tokens": 158902991.0, + "step": 4160 + }, + { + "epoch": 0.5293219692151125, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.814640045166016, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8635821342468262, + "num_tokens": 158945963.0, + "step": 4161 + }, + { + "epoch": 0.5294491794937031, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.846155166625977, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8543707728385925, + "num_tokens": 158985407.0, + "step": 4162 + }, + { + "epoch": 0.5295763897722936, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.777822494506836, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8483287692070007, + "num_tokens": 159024387.0, + "step": 4163 + }, + { + "epoch": 0.5297036000508841, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.793621063232422, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8473341464996338, + "num_tokens": 159061430.0, + "step": 4164 + }, + { + "epoch": 0.5298308103294747, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.65607261657715, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8572118282318115, + "num_tokens": 159099350.0, + "step": 4165 + }, + { + "epoch": 0.5299580206080652, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.3878116607666, + "learning_rate": 1e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8303167223930359, + "num_tokens": 159136276.0, + "step": 4166 + }, + { + "epoch": 0.5300852308866556, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.87557029724121, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8772765398025513, + "num_tokens": 159177102.0, + "step": 4167 + }, + { + "epoch": 0.5302124411652461, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0817928314209, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8671032190322876, + "num_tokens": 159213582.0, + "step": 4168 + }, + { + "epoch": 0.5303396514438367, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.688583374023438, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8549119234085083, + "num_tokens": 159254283.0, + "step": 4169 + }, + { + "epoch": 0.5304668617224272, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.198272705078125, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8583916425704956, + "num_tokens": 159288557.0, + "step": 4170 + }, + { + "epoch": 0.5305940720010177, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.925689697265625, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.865163266658783, + "num_tokens": 159325047.0, + "step": 4171 + }, + { + "epoch": 0.5307212822796082, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.57097053527832, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8600692749023438, + "num_tokens": 159366291.0, + "step": 4172 + }, + { + "epoch": 0.5308484925581987, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.319576263427734, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.850205659866333, + "num_tokens": 159401634.0, + "step": 4173 + }, + { + "epoch": 0.5309757028367892, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.6317081451416, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8483338356018066, + "num_tokens": 159435316.0, + "step": 4174 + }, + { + "epoch": 0.5311029131153797, + "ewc_loss": 0.022216796875, + "ewc_loss_parallel": 2.2172927856445312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.61932373046875, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8482436537742615, + "num_tokens": 159472192.0, + "step": 4175 + }, + { + "epoch": 0.5312301233939702, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.885683059692383, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8632282018661499, + "num_tokens": 159515342.0, + "step": 4176 + }, + { + "epoch": 0.5313573336725608, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.931949615478516, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.859947681427002, + "num_tokens": 159553932.0, + "step": 4177 + }, + { + "epoch": 0.5314845439511513, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.5723876953125, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8572829961776733, + "num_tokens": 159584711.0, + "step": 4178 + }, + { + "epoch": 0.5316117542297417, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.70999526977539, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8426912426948547, + "num_tokens": 159618784.0, + "step": 4179 + }, + { + "epoch": 0.5317389645083322, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.776531219482422, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8541041612625122, + "num_tokens": 159661288.0, + "step": 4180 + }, + { + "epoch": 0.5318661747869228, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.687095642089844, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8665515184402466, + "num_tokens": 159701286.0, + "step": 4181 + }, + { + "epoch": 0.5319933850655133, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.701887130737305, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8428418636322021, + "num_tokens": 159746631.0, + "step": 4182 + }, + { + "epoch": 0.5321205953441038, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.93098258972168, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8590673208236694, + "num_tokens": 159787753.0, + "step": 4183 + }, + { + "epoch": 0.5322478056226944, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.766782760620117, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8483921885490417, + "num_tokens": 159823092.0, + "step": 4184 + }, + { + "epoch": 0.5323750159012848, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.670137405395508, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8476468324661255, + "num_tokens": 159858480.0, + "step": 4185 + }, + { + "epoch": 0.5325022261798753, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.951059341430664, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.847029447555542, + "num_tokens": 159896220.0, + "step": 4186 + }, + { + "epoch": 0.5326294364584658, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.623414993286133, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8530961871147156, + "num_tokens": 159930995.0, + "step": 4187 + }, + { + "epoch": 0.5327566467370564, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.78436279296875, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8512188196182251, + "num_tokens": 159968013.0, + "step": 4188 + }, + { + "epoch": 0.5328838570156469, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.743423461914062, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8488321304321289, + "num_tokens": 160004085.0, + "step": 4189 + }, + { + "epoch": 0.5330110672942374, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.942224502563477, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.866190493106842, + "num_tokens": 160040606.0, + "step": 4190 + }, + { + "epoch": 0.5331382775728278, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83010482788086, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8668960332870483, + "num_tokens": 160080792.0, + "step": 4191 + }, + { + "epoch": 0.5332654878514184, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.778921127319336, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8444938659667969, + "num_tokens": 160117055.0, + "step": 4192 + }, + { + "epoch": 0.5333926981300089, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92568588256836, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8608368635177612, + "num_tokens": 160153223.0, + "step": 4193 + }, + { + "epoch": 0.5335199084085994, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.866615295410156, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.838823676109314, + "num_tokens": 160190920.0, + "step": 4194 + }, + { + "epoch": 0.53364711868719, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.74410057067871, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8531455993652344, + "num_tokens": 160221898.0, + "step": 4195 + }, + { + "epoch": 0.5337743289657805, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.998735427856445, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8587603569030762, + "num_tokens": 160267102.0, + "step": 4196 + }, + { + "epoch": 0.5339015392443709, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.774017333984375, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8607324957847595, + "num_tokens": 160304422.0, + "step": 4197 + }, + { + "epoch": 0.5340287495229614, + "ewc_loss": 0.0223388671875, + "ewc_loss_parallel": 2.2292137145996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.645587921142578, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8533270955085754, + "num_tokens": 160339006.0, + "step": 4198 + }, + { + "epoch": 0.534155959801552, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.781667709350586, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8757284879684448, + "num_tokens": 160377504.0, + "step": 4199 + }, + { + "epoch": 0.5342831700801425, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.689167022705078, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8353813886642456, + "num_tokens": 160413315.0, + "step": 4200 + }, + { + "epoch": 0.534410380358733, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92321014404297, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8697770833969116, + "num_tokens": 160455034.0, + "step": 4201 + }, + { + "epoch": 0.5345375906373235, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.708751678466797, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.849937915802002, + "num_tokens": 160490505.0, + "step": 4202 + }, + { + "epoch": 0.534664800915914, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.949975967407227, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8657877445220947, + "num_tokens": 160528712.0, + "step": 4203 + }, + { + "epoch": 0.5347920111945045, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.59048080444336, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8466829061508179, + "num_tokens": 160567945.0, + "step": 4204 + }, + { + "epoch": 0.534919221473095, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.874372482299805, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8633149266242981, + "num_tokens": 160596767.0, + "step": 4205 + }, + { + "epoch": 0.5350464317516855, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.721275329589844, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8522729873657227, + "num_tokens": 160636912.0, + "step": 4206 + }, + { + "epoch": 0.5351736420302761, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.969825744628906, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8489099144935608, + "num_tokens": 160674588.0, + "step": 4207 + }, + { + "epoch": 0.5353008523088666, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.72264289855957, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8440508246421814, + "num_tokens": 160713441.0, + "step": 4208 + }, + { + "epoch": 0.5354280625874571, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.796239852905273, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8658351898193359, + "num_tokens": 160752716.0, + "step": 4209 + }, + { + "epoch": 0.5355552728660475, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.819658279418945, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8659083843231201, + "num_tokens": 160798771.0, + "step": 4210 + }, + { + "epoch": 0.5356824831446381, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.751325607299805, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8724066615104675, + "num_tokens": 160843085.0, + "step": 4211 + }, + { + "epoch": 0.5358096934232286, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.826078414916992, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8479008078575134, + "num_tokens": 160883073.0, + "step": 4212 + }, + { + "epoch": 0.5359369037018191, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.69866943359375, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8623355031013489, + "num_tokens": 160923734.0, + "step": 4213 + }, + { + "epoch": 0.5360641139804097, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.80265235900879, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8524658679962158, + "num_tokens": 160963460.0, + "step": 4214 + }, + { + "epoch": 0.5361913242590002, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.73448944091797, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8532435297966003, + "num_tokens": 161003691.0, + "step": 4215 + }, + { + "epoch": 0.5363185345375906, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.697237014770508, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8565083742141724, + "num_tokens": 161042056.0, + "step": 4216 + }, + { + "epoch": 0.5364457448161811, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.776792526245117, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.879114031791687, + "num_tokens": 161078670.0, + "step": 4217 + }, + { + "epoch": 0.5365729550947717, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.90683937072754, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8691936135292053, + "num_tokens": 161113992.0, + "step": 4218 + }, + { + "epoch": 0.5367001653733622, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.817264556884766, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8616057634353638, + "num_tokens": 161153763.0, + "step": 4219 + }, + { + "epoch": 0.5368273756519527, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.708831787109375, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8400552272796631, + "num_tokens": 161184186.0, + "step": 4220 + }, + { + "epoch": 0.5369545859305432, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.74695587158203, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8536097407341003, + "num_tokens": 161219298.0, + "step": 4221 + }, + { + "epoch": 0.5370817962091337, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.679351806640625, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8712244033813477, + "num_tokens": 161255252.0, + "step": 4222 + }, + { + "epoch": 0.5372090064877242, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.016990661621094, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8611305952072144, + "num_tokens": 161295890.0, + "step": 4223 + }, + { + "epoch": 0.5373362167663147, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83656120300293, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8538975715637207, + "num_tokens": 161330808.0, + "step": 4224 + }, + { + "epoch": 0.5374634270449052, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77511215209961, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8645004034042358, + "num_tokens": 161370515.0, + "step": 4225 + }, + { + "epoch": 0.5375906373234958, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.027950286865234, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8690338134765625, + "num_tokens": 161405257.0, + "step": 4226 + }, + { + "epoch": 0.5377178476020863, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.036046981811523, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8578046560287476, + "num_tokens": 161443526.0, + "step": 4227 + }, + { + "epoch": 0.5378450578806767, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.814048767089844, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8501206636428833, + "num_tokens": 161481262.0, + "step": 4228 + }, + { + "epoch": 0.5379722681592672, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.622844696044922, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8412642478942871, + "num_tokens": 161517756.0, + "step": 4229 + }, + { + "epoch": 0.5380994784378578, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.973011016845703, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8652425408363342, + "num_tokens": 161552680.0, + "step": 4230 + }, + { + "epoch": 0.5382266887164483, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.808452606201172, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8532134294509888, + "num_tokens": 161588607.0, + "step": 4231 + }, + { + "epoch": 0.5383538989950388, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.735641479492188, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8407586812973022, + "num_tokens": 161626427.0, + "step": 4232 + }, + { + "epoch": 0.5384811092736294, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.860980987548828, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8618760108947754, + "num_tokens": 161665182.0, + "step": 4233 + }, + { + "epoch": 0.5386083195522198, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.742494583129883, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8570704460144043, + "num_tokens": 161704685.0, + "step": 4234 + }, + { + "epoch": 0.5387355298308103, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.818265914916992, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8619422912597656, + "num_tokens": 161742960.0, + "step": 4235 + }, + { + "epoch": 0.5388627401094008, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.792993545532227, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8554815053939819, + "num_tokens": 161778199.0, + "step": 4236 + }, + { + "epoch": 0.5389899503879914, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.803367614746094, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8526251316070557, + "num_tokens": 161811235.0, + "step": 4237 + }, + { + "epoch": 0.5391171606665819, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.82489776611328, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8562867641448975, + "num_tokens": 161847669.0, + "step": 4238 + }, + { + "epoch": 0.5392443709451724, + "ewc_loss": 0.0225830078125, + "ewc_loss_parallel": 2.2530555725097656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.845197677612305, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8430067896842957, + "num_tokens": 161888470.0, + "step": 4239 + }, + { + "epoch": 0.5393715812237628, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.7528018951416, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.837507963180542, + "num_tokens": 161924563.0, + "step": 4240 + }, + { + "epoch": 0.5394987915023534, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.91520881652832, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.842124879360199, + "num_tokens": 161959062.0, + "step": 4241 + }, + { + "epoch": 0.5396260017809439, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.809593200683594, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8598248958587646, + "num_tokens": 161994755.0, + "step": 4242 + }, + { + "epoch": 0.5397532120595344, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.804121017456055, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8654444217681885, + "num_tokens": 162032812.0, + "step": 4243 + }, + { + "epoch": 0.5398804223381249, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.705427169799805, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8598654866218567, + "num_tokens": 162067130.0, + "step": 4244 + }, + { + "epoch": 0.5400076326167155, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.749895095825195, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8582823276519775, + "num_tokens": 162107452.0, + "step": 4245 + }, + { + "epoch": 0.5401348428953059, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.669384002685547, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.857025682926178, + "num_tokens": 162144220.0, + "step": 4246 + }, + { + "epoch": 0.5402620531738964, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.89497184753418, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.865703821182251, + "num_tokens": 162181798.0, + "step": 4247 + }, + { + "epoch": 0.540389263452487, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.668357849121094, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8487321138381958, + "num_tokens": 162222501.0, + "step": 4248 + }, + { + "epoch": 0.5405164737310775, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.933317184448242, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8533965349197388, + "num_tokens": 162265533.0, + "step": 4249 + }, + { + "epoch": 0.540643684009668, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.779605865478516, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8714622855186462, + "num_tokens": 162304984.0, + "step": 4250 + }, + { + "epoch": 0.5407708942882585, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.653345108032227, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8472933769226074, + "num_tokens": 162342358.0, + "step": 4251 + }, + { + "epoch": 0.540898104566849, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.81100845336914, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8582080602645874, + "num_tokens": 162377343.0, + "step": 4252 + }, + { + "epoch": 0.5410253148454395, + "ewc_loss": 0.022705078125, + "ewc_loss_parallel": 2.2649765014648438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.74073600769043, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8566162586212158, + "num_tokens": 162416416.0, + "step": 4253 + }, + { + "epoch": 0.54115252512403, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.754404067993164, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.883421778678894, + "num_tokens": 162447108.0, + "step": 4254 + }, + { + "epoch": 0.5412797354026205, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.811861038208008, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8588610291481018, + "num_tokens": 162489659.0, + "step": 4255 + }, + { + "epoch": 0.5414069456812111, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.700864791870117, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8464280366897583, + "num_tokens": 162525816.0, + "step": 4256 + }, + { + "epoch": 0.5415341559598016, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.858457565307617, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8525897264480591, + "num_tokens": 162564154.0, + "step": 4257 + }, + { + "epoch": 0.5416613662383921, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.80887222290039, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8503944873809814, + "num_tokens": 162602728.0, + "step": 4258 + }, + { + "epoch": 0.5417885765169825, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.683263778686523, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8615446090698242, + "num_tokens": 162636659.0, + "step": 4259 + }, + { + "epoch": 0.5419157867955731, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.753582000732422, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8729012608528137, + "num_tokens": 162676639.0, + "step": 4260 + }, + { + "epoch": 0.5420429970741636, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.671449661254883, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8383939266204834, + "num_tokens": 162716763.0, + "step": 4261 + }, + { + "epoch": 0.5421702073527541, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.78352165222168, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8459537625312805, + "num_tokens": 162753943.0, + "step": 4262 + }, + { + "epoch": 0.5422974176313446, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.694990158081055, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8521032333374023, + "num_tokens": 162792938.0, + "step": 4263 + }, + { + "epoch": 0.5424246279099352, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.71829605102539, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8549633622169495, + "num_tokens": 162838938.0, + "step": 4264 + }, + { + "epoch": 0.5425518381885256, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.73809814453125, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8628156781196594, + "num_tokens": 162875026.0, + "step": 4265 + }, + { + "epoch": 0.5426790484671161, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.734947204589844, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8611742854118347, + "num_tokens": 162910724.0, + "step": 4266 + }, + { + "epoch": 0.5428062587457066, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.81451988220215, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8603640794754028, + "num_tokens": 162941602.0, + "step": 4267 + }, + { + "epoch": 0.5429334690242972, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.846214294433594, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8432291746139526, + "num_tokens": 162980760.0, + "step": 4268 + }, + { + "epoch": 0.5430606793028877, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.73824691772461, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.867027997970581, + "num_tokens": 163016155.0, + "step": 4269 + }, + { + "epoch": 0.5431878895814782, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.937694549560547, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8480581045150757, + "num_tokens": 163052519.0, + "step": 4270 + }, + { + "epoch": 0.5433150998600687, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.795787811279297, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8651942610740662, + "num_tokens": 163091806.0, + "step": 4271 + }, + { + "epoch": 0.5434423101386592, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.769399642944336, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8726032972335815, + "num_tokens": 163128830.0, + "step": 4272 + }, + { + "epoch": 0.5435695204172497, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.872983932495117, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8527977466583252, + "num_tokens": 163169567.0, + "step": 4273 + }, + { + "epoch": 0.5436967306958402, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.727901458740234, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.849571168422699, + "num_tokens": 163202913.0, + "step": 4274 + }, + { + "epoch": 0.5438239409744308, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.747459411621094, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8646501898765564, + "num_tokens": 163236741.0, + "step": 4275 + }, + { + "epoch": 0.5439511512530213, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.9141902923584, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8511911630630493, + "num_tokens": 163281402.0, + "step": 4276 + }, + { + "epoch": 0.5440783615316117, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.66147804260254, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8655990362167358, + "num_tokens": 163325288.0, + "step": 4277 + }, + { + "epoch": 0.5442055718102022, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.929767608642578, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8463577032089233, + "num_tokens": 163367471.0, + "step": 4278 + }, + { + "epoch": 0.5443327820887928, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83685302734375, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8574776649475098, + "num_tokens": 163400077.0, + "step": 4279 + }, + { + "epoch": 0.5444599923673833, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.698408126831055, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.860854983329773, + "num_tokens": 163433739.0, + "step": 4280 + }, + { + "epoch": 0.5445872026459738, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.84026527404785, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8572970628738403, + "num_tokens": 163467447.0, + "step": 4281 + }, + { + "epoch": 0.5447144129245644, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77448081970215, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8536894917488098, + "num_tokens": 163506633.0, + "step": 4282 + }, + { + "epoch": 0.5448416232031548, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.87043571472168, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8404809236526489, + "num_tokens": 163542773.0, + "step": 4283 + }, + { + "epoch": 0.5449688334817453, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.84564781188965, + "learning_rate": 1e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8233751058578491, + "num_tokens": 163585247.0, + "step": 4284 + }, + { + "epoch": 0.5450960437603358, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.669776916503906, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8448849320411682, + "num_tokens": 163622802.0, + "step": 4285 + }, + { + "epoch": 0.5452232540389264, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83707618713379, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8809036016464233, + "num_tokens": 163655980.0, + "step": 4286 + }, + { + "epoch": 0.5453504643175169, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.518306732177734, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8621513843536377, + "num_tokens": 163691494.0, + "step": 4287 + }, + { + "epoch": 0.5454776745961074, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.225759506225586, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8676164746284485, + "num_tokens": 163725169.0, + "step": 4288 + }, + { + "epoch": 0.5456048848746978, + "ewc_loss": 0.0224609375, + "ewc_loss_parallel": 2.2411346435546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.560760498046875, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8609546422958374, + "num_tokens": 163766050.0, + "step": 4289 + }, + { + "epoch": 0.5457320951532884, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.532175064086914, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8513023853302002, + "num_tokens": 163799559.0, + "step": 4290 + }, + { + "epoch": 0.5458593054318789, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.08847427368164, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.851771891117096, + "num_tokens": 163837052.0, + "step": 4291 + }, + { + "epoch": 0.5459865157104694, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.56327247619629, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8534644842147827, + "num_tokens": 163872494.0, + "step": 4292 + }, + { + "epoch": 0.5461137259890599, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.233562469482422, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.848893404006958, + "num_tokens": 163913499.0, + "step": 4293 + }, + { + "epoch": 0.5462409362676505, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.01235008239746, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8370460271835327, + "num_tokens": 163957965.0, + "step": 4294 + }, + { + "epoch": 0.5463681465462409, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.738527297973633, + "learning_rate": 1e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8156384825706482, + "num_tokens": 164005661.0, + "step": 4295 + }, + { + "epoch": 0.5464953568248314, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.860708236694336, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8727521896362305, + "num_tokens": 164044717.0, + "step": 4296 + }, + { + "epoch": 0.5466225671034219, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.828580856323242, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8561800718307495, + "num_tokens": 164080022.0, + "step": 4297 + }, + { + "epoch": 0.5467497773820125, + "ewc_loss": 0.0228271484375, + "ewc_loss_parallel": 2.276897430419922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0474853515625, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8639202117919922, + "num_tokens": 164112749.0, + "step": 4298 + }, + { + "epoch": 0.546876987660603, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.86884117126465, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8515004515647888, + "num_tokens": 164152517.0, + "step": 4299 + }, + { + "epoch": 0.5470041979391935, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.7609806060791, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8626725077629089, + "num_tokens": 164195245.0, + "step": 4300 + }, + { + "epoch": 0.5471314082177839, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77073860168457, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8519307374954224, + "num_tokens": 164230643.0, + "step": 4301 + }, + { + "epoch": 0.5472586184963745, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.900920867919922, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.853134036064148, + "num_tokens": 164271089.0, + "step": 4302 + }, + { + "epoch": 0.547385828774965, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.9050350189209, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8545612096786499, + "num_tokens": 164310618.0, + "step": 4303 + }, + { + "epoch": 0.5475130390535555, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.790576934814453, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8431854248046875, + "num_tokens": 164348630.0, + "step": 4304 + }, + { + "epoch": 0.5476402493321461, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.662353515625, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8630681037902832, + "num_tokens": 164383735.0, + "step": 4305 + }, + { + "epoch": 0.5477674596107366, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.730560302734375, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8575747013092041, + "num_tokens": 164426115.0, + "step": 4306 + }, + { + "epoch": 0.5478946698893271, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.838909149169922, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8487012386322021, + "num_tokens": 164462119.0, + "step": 4307 + }, + { + "epoch": 0.5480218801679175, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.803770065307617, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8541488647460938, + "num_tokens": 164503030.0, + "step": 4308 + }, + { + "epoch": 0.5481490904465081, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.74803924560547, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.859656572341919, + "num_tokens": 164541091.0, + "step": 4309 + }, + { + "epoch": 0.5482763007250986, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.840045928955078, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8512518405914307, + "num_tokens": 164577678.0, + "step": 4310 + }, + { + "epoch": 0.5484035110036891, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.796192169189453, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8537654280662537, + "num_tokens": 164612779.0, + "step": 4311 + }, + { + "epoch": 0.5485307212822796, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.695545196533203, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8606806397438049, + "num_tokens": 164649849.0, + "step": 4312 + }, + { + "epoch": 0.5486579315608702, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.87087059020996, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.853975772857666, + "num_tokens": 164690538.0, + "step": 4313 + }, + { + "epoch": 0.5487851418394606, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.97188949584961, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8570674657821655, + "num_tokens": 164729338.0, + "step": 4314 + }, + { + "epoch": 0.5489123521180511, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.66063690185547, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.868123471736908, + "num_tokens": 164766648.0, + "step": 4315 + }, + { + "epoch": 0.5490395623966416, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.037805557250977, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.855483889579773, + "num_tokens": 164804254.0, + "step": 4316 + }, + { + "epoch": 0.5491667726752322, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.781780242919922, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.869531512260437, + "num_tokens": 164839943.0, + "step": 4317 + }, + { + "epoch": 0.5492939829538227, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.74536895751953, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8480960130691528, + "num_tokens": 164887057.0, + "step": 4318 + }, + { + "epoch": 0.5494211932324132, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.899524688720703, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8562930226325989, + "num_tokens": 164928515.0, + "step": 4319 + }, + { + "epoch": 0.5495484035110036, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.714582443237305, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8562567234039307, + "num_tokens": 164963818.0, + "step": 4320 + }, + { + "epoch": 0.5496756137895942, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.7921085357666, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.870686411857605, + "num_tokens": 165001091.0, + "step": 4321 + }, + { + "epoch": 0.5498028240681847, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.752717971801758, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8389755487442017, + "num_tokens": 165046009.0, + "step": 4322 + }, + { + "epoch": 0.5499300343467752, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.780122756958008, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8526131510734558, + "num_tokens": 165090729.0, + "step": 4323 + }, + { + "epoch": 0.5500572446253658, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.771215438842773, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8730521202087402, + "num_tokens": 165129771.0, + "step": 4324 + }, + { + "epoch": 0.5501844549039563, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.851613998413086, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.872289776802063, + "num_tokens": 165164310.0, + "step": 4325 + }, + { + "epoch": 0.5503116651825467, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.740753173828125, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8538538217544556, + "num_tokens": 165202842.0, + "step": 4326 + }, + { + "epoch": 0.5504388754611372, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77704620361328, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8599528074264526, + "num_tokens": 165235998.0, + "step": 4327 + }, + { + "epoch": 0.5505660857397278, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.90976333618164, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8583817481994629, + "num_tokens": 165271779.0, + "step": 4328 + }, + { + "epoch": 0.5506932960183183, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.97846031188965, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8514435291290283, + "num_tokens": 165306136.0, + "step": 4329 + }, + { + "epoch": 0.5508205062969088, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.700265884399414, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8703380823135376, + "num_tokens": 165342401.0, + "step": 4330 + }, + { + "epoch": 0.5509477165754993, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.894428253173828, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8605062365531921, + "num_tokens": 165380120.0, + "step": 4331 + }, + { + "epoch": 0.5510749268540898, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.837453842163086, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8443273901939392, + "num_tokens": 165418210.0, + "step": 4332 + }, + { + "epoch": 0.5512021371326803, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.837064743041992, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8366414904594421, + "num_tokens": 165460952.0, + "step": 4333 + }, + { + "epoch": 0.5513293474112708, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.861141204833984, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8695226907730103, + "num_tokens": 165500370.0, + "step": 4334 + }, + { + "epoch": 0.5514565576898613, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.780881881713867, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8570733070373535, + "num_tokens": 165534089.0, + "step": 4335 + }, + { + "epoch": 0.5515837679684519, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.619680404663086, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8496445417404175, + "num_tokens": 165573111.0, + "step": 4336 + }, + { + "epoch": 0.5517109782470424, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.838613510131836, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8469060063362122, + "num_tokens": 165611942.0, + "step": 4337 + }, + { + "epoch": 0.5518381885256328, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83291244506836, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.864139974117279, + "num_tokens": 165651703.0, + "step": 4338 + }, + { + "epoch": 0.5519653988042234, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10370635986328, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8540968894958496, + "num_tokens": 165691257.0, + "step": 4339 + }, + { + "epoch": 0.5520926090828139, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.70252227783203, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8727218508720398, + "num_tokens": 165727947.0, + "step": 4340 + }, + { + "epoch": 0.5522198193614044, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.304595947265625, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8656703233718872, + "num_tokens": 165763400.0, + "step": 4341 + }, + { + "epoch": 0.5523470296399949, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.677536010742188, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.856315016746521, + "num_tokens": 165809586.0, + "step": 4342 + }, + { + "epoch": 0.5524742399185855, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.288818359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10587501525879, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8582279682159424, + "num_tokens": 165839565.0, + "step": 4343 + }, + { + "epoch": 0.5526014501971759, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.088031768798828, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8599772453308105, + "num_tokens": 165873539.0, + "step": 4344 + }, + { + "epoch": 0.5527286604757664, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.481531143188477, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8653544783592224, + "num_tokens": 165920987.0, + "step": 4345 + }, + { + "epoch": 0.5528558707543569, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.205108642578125, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.846997082233429, + "num_tokens": 165965661.0, + "step": 4346 + }, + { + "epoch": 0.5529830810329475, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.853818893432617, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8503379821777344, + "num_tokens": 165998239.0, + "step": 4347 + }, + { + "epoch": 0.553110291311538, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.756834030151367, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.854373574256897, + "num_tokens": 166036489.0, + "step": 4348 + }, + { + "epoch": 0.5532375015901285, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.060407638549805, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8530046939849854, + "num_tokens": 166078774.0, + "step": 4349 + }, + { + "epoch": 0.5533647118687189, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.692014694213867, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8644763231277466, + "num_tokens": 166119555.0, + "step": 4350 + }, + { + "epoch": 0.5534919221473095, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.78719139099121, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8556708097457886, + "num_tokens": 166159788.0, + "step": 4351 + }, + { + "epoch": 0.5536191324259, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.949739456176758, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8515930771827698, + "num_tokens": 166200230.0, + "step": 4352 + }, + { + "epoch": 0.5537463427044905, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.876142501831055, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8596310019493103, + "num_tokens": 166237864.0, + "step": 4353 + }, + { + "epoch": 0.553873552983081, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.637062072753906, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8593482971191406, + "num_tokens": 166278347.0, + "step": 4354 + }, + { + "epoch": 0.5540007632616716, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.828794479370117, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8486723899841309, + "num_tokens": 166315744.0, + "step": 4355 + }, + { + "epoch": 0.554127973540262, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.768856048583984, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.857166588306427, + "num_tokens": 166352705.0, + "step": 4356 + }, + { + "epoch": 0.5542551838188525, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.74515151977539, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8612560033798218, + "num_tokens": 166393956.0, + "step": 4357 + }, + { + "epoch": 0.554382394097443, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.73528289794922, + "learning_rate": 1e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8348188400268555, + "num_tokens": 166430172.0, + "step": 4358 + }, + { + "epoch": 0.5545096043760336, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.045326232910156, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8564035892486572, + "num_tokens": 166468732.0, + "step": 4359 + }, + { + "epoch": 0.5546368146546241, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.679931640625, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.852476954460144, + "num_tokens": 166511383.0, + "step": 4360 + }, + { + "epoch": 0.5547640249332146, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.914382934570312, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8605146408081055, + "num_tokens": 166547674.0, + "step": 4361 + }, + { + "epoch": 0.5548912352118052, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.759830474853516, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8493669629096985, + "num_tokens": 166581763.0, + "step": 4362 + }, + { + "epoch": 0.5550184454903956, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.726112365722656, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8541386127471924, + "num_tokens": 166618604.0, + "step": 4363 + }, + { + "epoch": 0.5551456557689861, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.72091293334961, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8657622337341309, + "num_tokens": 166654812.0, + "step": 4364 + }, + { + "epoch": 0.5552728660475766, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.780746459960938, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8426847457885742, + "num_tokens": 166693206.0, + "step": 4365 + }, + { + "epoch": 0.5554000763261672, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.787134170532227, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8554329872131348, + "num_tokens": 166734531.0, + "step": 4366 + }, + { + "epoch": 0.5555272866047577, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.76762580871582, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8555001616477966, + "num_tokens": 166771362.0, + "step": 4367 + }, + { + "epoch": 0.5556544968833482, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.800262451171875, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8478635549545288, + "num_tokens": 166802784.0, + "step": 4368 + }, + { + "epoch": 0.5557817071619386, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.731958389282227, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8525004386901855, + "num_tokens": 166835590.0, + "step": 4369 + }, + { + "epoch": 0.5559089174405292, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.800594329833984, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8511483669281006, + "num_tokens": 166872145.0, + "step": 4370 + }, + { + "epoch": 0.5560361277191197, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.89864158630371, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.85340416431427, + "num_tokens": 166912855.0, + "step": 4371 + }, + { + "epoch": 0.5561633379977102, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.64937400817871, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8416463136672974, + "num_tokens": 166949482.0, + "step": 4372 + }, + { + "epoch": 0.5562905482763008, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.814619064331055, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.856044590473175, + "num_tokens": 166986727.0, + "step": 4373 + }, + { + "epoch": 0.5564177585548913, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.776004791259766, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8486424684524536, + "num_tokens": 167025869.0, + "step": 4374 + }, + { + "epoch": 0.5565449688334817, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.780397415161133, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8670796155929565, + "num_tokens": 167059825.0, + "step": 4375 + }, + { + "epoch": 0.5566721791120722, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.851333618164062, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8611084222793579, + "num_tokens": 167097375.0, + "step": 4376 + }, + { + "epoch": 0.5567993893906628, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.806615829467773, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8478526473045349, + "num_tokens": 167136555.0, + "step": 4377 + }, + { + "epoch": 0.5569265996692533, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.792036056518555, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8541548848152161, + "num_tokens": 167179870.0, + "step": 4378 + }, + { + "epoch": 0.5570538099478438, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.746057510375977, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8548831939697266, + "num_tokens": 167215242.0, + "step": 4379 + }, + { + "epoch": 0.5571810202264343, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.913820266723633, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8644258379936218, + "num_tokens": 167250690.0, + "step": 4380 + }, + { + "epoch": 0.5573082305050248, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.869319915771484, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8433070778846741, + "num_tokens": 167283677.0, + "step": 4381 + }, + { + "epoch": 0.5574354407836153, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.791248321533203, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8571281433105469, + "num_tokens": 167321355.0, + "step": 4382 + }, + { + "epoch": 0.5575626510622058, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.845548629760742, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8485949039459229, + "num_tokens": 167363103.0, + "step": 4383 + }, + { + "epoch": 0.5576898613407963, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.84295654296875, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8701450824737549, + "num_tokens": 167399732.0, + "step": 4384 + }, + { + "epoch": 0.5578170716193869, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.829952239990234, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8556127548217773, + "num_tokens": 167434938.0, + "step": 4385 + }, + { + "epoch": 0.5579442818979774, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.982444763183594, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8590108156204224, + "num_tokens": 167472293.0, + "step": 4386 + }, + { + "epoch": 0.5580714921765678, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.740291595458984, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8610777854919434, + "num_tokens": 167511015.0, + "step": 4387 + }, + { + "epoch": 0.5581987024551583, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.005373001098633, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8563426733016968, + "num_tokens": 167549777.0, + "step": 4388 + }, + { + "epoch": 0.5583259127337489, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.784517288208008, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8628886938095093, + "num_tokens": 167592342.0, + "step": 4389 + }, + { + "epoch": 0.5584531230123394, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.754365921020508, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8604456186294556, + "num_tokens": 167630829.0, + "step": 4390 + }, + { + "epoch": 0.5585803332909299, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.804729461669922, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.844477653503418, + "num_tokens": 167675938.0, + "step": 4391 + }, + { + "epoch": 0.5587075435695205, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.89925193786621, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8598813414573669, + "num_tokens": 167716550.0, + "step": 4392 + }, + { + "epoch": 0.5588347538481109, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.735994338989258, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8434033393859863, + "num_tokens": 167754401.0, + "step": 4393 + }, + { + "epoch": 0.5589619641267014, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83063316345215, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8598551750183105, + "num_tokens": 167792124.0, + "step": 4394 + }, + { + "epoch": 0.5590891744052919, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.867477416992188, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8749182820320129, + "num_tokens": 167829226.0, + "step": 4395 + }, + { + "epoch": 0.5592163846838825, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.89111328125, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8665235042572021, + "num_tokens": 167863439.0, + "step": 4396 + }, + { + "epoch": 0.559343594962473, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.80809211730957, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8571532964706421, + "num_tokens": 167907019.0, + "step": 4397 + }, + { + "epoch": 0.5594708052410635, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.868162155151367, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8483930826187134, + "num_tokens": 167942539.0, + "step": 4398 + }, + { + "epoch": 0.5595980155196539, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.88774871826172, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8559756278991699, + "num_tokens": 167982848.0, + "step": 4399 + }, + { + "epoch": 0.5597252257982445, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.900737762451172, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8647701144218445, + "num_tokens": 168017500.0, + "step": 4400 + }, + { + "epoch": 0.559852436076835, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.9108829498291, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8409940004348755, + "num_tokens": 168052790.0, + "step": 4401 + }, + { + "epoch": 0.5599796463554255, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.827484130859375, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8642910718917847, + "num_tokens": 168082541.0, + "step": 4402 + }, + { + "epoch": 0.560106856634016, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.887033462524414, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.863107442855835, + "num_tokens": 168128218.0, + "step": 4403 + }, + { + "epoch": 0.5602340669126066, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83601188659668, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8584264516830444, + "num_tokens": 168165496.0, + "step": 4404 + }, + { + "epoch": 0.560361277191197, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77995491027832, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.844355583190918, + "num_tokens": 168204675.0, + "step": 4405 + }, + { + "epoch": 0.5604884874697875, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.865379333496094, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.853027880191803, + "num_tokens": 168244832.0, + "step": 4406 + }, + { + "epoch": 0.560615697748378, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.878826141357422, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8518063426017761, + "num_tokens": 168282440.0, + "step": 4407 + }, + { + "epoch": 0.5607429080269686, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.854116439819336, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8618655204772949, + "num_tokens": 168325303.0, + "step": 4408 + }, + { + "epoch": 0.5608701183055591, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.936439514160156, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8619115352630615, + "num_tokens": 168360628.0, + "step": 4409 + }, + { + "epoch": 0.5609973285841496, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.885425567626953, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8436987400054932, + "num_tokens": 168398601.0, + "step": 4410 + }, + { + "epoch": 0.5611245388627402, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.115394592285156, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.843296229839325, + "num_tokens": 168436893.0, + "step": 4411 + }, + { + "epoch": 0.5612517491413306, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.873262405395508, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8530068397521973, + "num_tokens": 168478794.0, + "step": 4412 + }, + { + "epoch": 0.5613789594199211, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.88846206665039, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8547945618629456, + "num_tokens": 168520184.0, + "step": 4413 + }, + { + "epoch": 0.5615061696985116, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.932645797729492, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8529341220855713, + "num_tokens": 168554305.0, + "step": 4414 + }, + { + "epoch": 0.5616333799771022, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77488899230957, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8404117226600647, + "num_tokens": 168597752.0, + "step": 4415 + }, + { + "epoch": 0.5617605902556927, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92643165588379, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8625698089599609, + "num_tokens": 168637764.0, + "step": 4416 + }, + { + "epoch": 0.5618878005342832, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.970735549926758, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8369466066360474, + "num_tokens": 168672260.0, + "step": 4417 + }, + { + "epoch": 0.5620150108128736, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.794147491455078, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8650935292243958, + "num_tokens": 168711478.0, + "step": 4418 + }, + { + "epoch": 0.5621422210914642, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.984956741333008, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8592716455459595, + "num_tokens": 168751773.0, + "step": 4419 + }, + { + "epoch": 0.5622694313700547, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.807472229003906, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8680012822151184, + "num_tokens": 168786364.0, + "step": 4420 + }, + { + "epoch": 0.5623966416486452, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.927223205566406, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8543734550476074, + "num_tokens": 168826819.0, + "step": 4421 + }, + { + "epoch": 0.5625238519272358, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.90744972229004, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8544047474861145, + "num_tokens": 168869405.0, + "step": 4422 + }, + { + "epoch": 0.5626510622058263, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.992544174194336, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8626887202262878, + "num_tokens": 168904323.0, + "step": 4423 + }, + { + "epoch": 0.5627782724844167, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.750696182250977, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8472192287445068, + "num_tokens": 168948654.0, + "step": 4424 + }, + { + "epoch": 0.5629054827630072, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.89156150817871, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8636171817779541, + "num_tokens": 168984333.0, + "step": 4425 + }, + { + "epoch": 0.5630326930415978, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.898723602294922, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8562098741531372, + "num_tokens": 169019843.0, + "step": 4426 + }, + { + "epoch": 0.5631599033201883, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.036237716674805, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8670449256896973, + "num_tokens": 169058177.0, + "step": 4427 + }, + { + "epoch": 0.5632871135987788, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.011442184448242, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8621902465820312, + "num_tokens": 169102268.0, + "step": 4428 + }, + { + "epoch": 0.5634143238773693, + "ewc_loss": 0.0230712890625, + "ewc_loss_parallel": 2.3126602172851562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.90156364440918, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8516480922698975, + "num_tokens": 169141709.0, + "step": 4429 + }, + { + "epoch": 0.5635415341559598, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.79084587097168, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8530523777008057, + "num_tokens": 169177107.0, + "step": 4430 + }, + { + "epoch": 0.5636687444345503, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.912490844726562, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8523678183555603, + "num_tokens": 169211531.0, + "step": 4431 + }, + { + "epoch": 0.5637959547131408, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.8574275970459, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.849390983581543, + "num_tokens": 169247507.0, + "step": 4432 + }, + { + "epoch": 0.5639231649917313, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.84838104248047, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8703662753105164, + "num_tokens": 169281007.0, + "step": 4433 + }, + { + "epoch": 0.5640503752703219, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.152393341064453, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8582456111907959, + "num_tokens": 169322295.0, + "step": 4434 + }, + { + "epoch": 0.5641775855489124, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.869171142578125, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8529486656188965, + "num_tokens": 169358184.0, + "step": 4435 + }, + { + "epoch": 0.5643047958275028, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.198057174682617, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8616551756858826, + "num_tokens": 169391518.0, + "step": 4436 + }, + { + "epoch": 0.5644320061060933, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.246240615844727, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8562197089195251, + "num_tokens": 169426743.0, + "step": 4437 + }, + { + "epoch": 0.5645592163846839, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.80061912536621, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8759797811508179, + "num_tokens": 169466452.0, + "step": 4438 + }, + { + "epoch": 0.5646864266632744, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.188125610351562, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8622570037841797, + "num_tokens": 169500295.0, + "step": 4439 + }, + { + "epoch": 0.5648136369418649, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.810714721679688, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8563708066940308, + "num_tokens": 169541829.0, + "step": 4440 + }, + { + "epoch": 0.5649408472204555, + "ewc_loss": 0.02294921875, + "ewc_loss_parallel": 2.300739288330078e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.96254539489746, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8412982225418091, + "num_tokens": 169581547.0, + "step": 4441 + }, + { + "epoch": 0.5650680574990459, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.774934768676758, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8616530895233154, + "num_tokens": 169622107.0, + "step": 4442 + }, + { + "epoch": 0.5651952677776364, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.88471221923828, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.853754460811615, + "num_tokens": 169657258.0, + "step": 4443 + }, + { + "epoch": 0.5653224780562269, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.896635055541992, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8457351326942444, + "num_tokens": 169691568.0, + "step": 4444 + }, + { + "epoch": 0.5654496883348175, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.921720504760742, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8631495237350464, + "num_tokens": 169732429.0, + "step": 4445 + }, + { + "epoch": 0.565576898613408, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.93609619140625, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8496575951576233, + "num_tokens": 169769224.0, + "step": 4446 + }, + { + "epoch": 0.5657041088919985, + "ewc_loss": 0.023193359375, + "ewc_loss_parallel": 2.3245811462402344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.877466201782227, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8569401502609253, + "num_tokens": 169812830.0, + "step": 4447 + }, + { + "epoch": 0.5658313191705889, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.842906951904297, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8541110157966614, + "num_tokens": 169843823.0, + "step": 4448 + }, + { + "epoch": 0.5659585294491795, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.935705184936523, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8586515188217163, + "num_tokens": 169882847.0, + "step": 4449 + }, + { + "epoch": 0.56608573972777, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.933826446533203, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8379141688346863, + "num_tokens": 169921264.0, + "step": 4450 + }, + { + "epoch": 0.5662129500063605, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.938459396362305, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.862374484539032, + "num_tokens": 169949681.0, + "step": 4451 + }, + { + "epoch": 0.566340160284951, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.9262752532959, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8708956837654114, + "num_tokens": 169986254.0, + "step": 4452 + }, + { + "epoch": 0.5664673705635416, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.87005615234375, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8529736995697021, + "num_tokens": 170022745.0, + "step": 4453 + }, + { + "epoch": 0.566594580842132, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.878448486328125, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8667725324630737, + "num_tokens": 170057038.0, + "step": 4454 + }, + { + "epoch": 0.5667217911207225, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.85365867614746, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8751416802406311, + "num_tokens": 170098460.0, + "step": 4455 + }, + { + "epoch": 0.566849001399313, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.77937126159668, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8445731997489929, + "num_tokens": 170137689.0, + "step": 4456 + }, + { + "epoch": 0.5669762116779036, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.046384811401367, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8583287000656128, + "num_tokens": 170176716.0, + "step": 4457 + }, + { + "epoch": 0.5671034219564941, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.877222061157227, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8524255752563477, + "num_tokens": 170212621.0, + "step": 4458 + }, + { + "epoch": 0.5672306322350846, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.85321044921875, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8473811745643616, + "num_tokens": 170254493.0, + "step": 4459 + }, + { + "epoch": 0.5673578425136752, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.15559959411621, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8771806955337524, + "num_tokens": 170290559.0, + "step": 4460 + }, + { + "epoch": 0.5674850527922656, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.910417556762695, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8504911661148071, + "num_tokens": 170335079.0, + "step": 4461 + }, + { + "epoch": 0.5676122630708561, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.925992965698242, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.859790563583374, + "num_tokens": 170370665.0, + "step": 4462 + }, + { + "epoch": 0.5677394733494466, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.115449905395508, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8560411930084229, + "num_tokens": 170412585.0, + "step": 4463 + }, + { + "epoch": 0.5678666836280372, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.911762237548828, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8625420331954956, + "num_tokens": 170450806.0, + "step": 4464 + }, + { + "epoch": 0.5679938939066277, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.982450485229492, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8568875789642334, + "num_tokens": 170483265.0, + "step": 4465 + }, + { + "epoch": 0.5681211041852182, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.896087646484375, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8731886148452759, + "num_tokens": 170520910.0, + "step": 4466 + }, + { + "epoch": 0.5682483144638086, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.930639266967773, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8391116857528687, + "num_tokens": 170556516.0, + "step": 4467 + }, + { + "epoch": 0.5683755247423992, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.993877410888672, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8555965423583984, + "num_tokens": 170595571.0, + "step": 4468 + }, + { + "epoch": 0.5685027350209897, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.889751434326172, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8750566244125366, + "num_tokens": 170634117.0, + "step": 4469 + }, + { + "epoch": 0.5686299452995802, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92290496826172, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8527839183807373, + "num_tokens": 170677624.0, + "step": 4470 + }, + { + "epoch": 0.5687571555781707, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.900402069091797, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8430638909339905, + "num_tokens": 170716732.0, + "step": 4471 + }, + { + "epoch": 0.5688843658567613, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.876646041870117, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8504456877708435, + "num_tokens": 170758521.0, + "step": 4472 + }, + { + "epoch": 0.5690115761353517, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.081798553466797, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8560271263122559, + "num_tokens": 170796799.0, + "step": 4473 + }, + { + "epoch": 0.5691387864139422, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.83180046081543, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8429088592529297, + "num_tokens": 170838569.0, + "step": 4474 + }, + { + "epoch": 0.5692659966925327, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.811203002929688, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8589073419570923, + "num_tokens": 170878206.0, + "step": 4475 + }, + { + "epoch": 0.5693932069711233, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.06755828857422, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8436809778213501, + "num_tokens": 170915506.0, + "step": 4476 + }, + { + "epoch": 0.5695204172497138, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.772409439086914, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8526953458786011, + "num_tokens": 170952303.0, + "step": 4477 + }, + { + "epoch": 0.5696476275283043, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.938278198242188, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8507048487663269, + "num_tokens": 170993757.0, + "step": 4478 + }, + { + "epoch": 0.5697748378068948, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.892683029174805, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8545279502868652, + "num_tokens": 171030240.0, + "step": 4479 + }, + { + "epoch": 0.5699020480854853, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.991043090820312, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8330320715904236, + "num_tokens": 171069654.0, + "step": 4480 + }, + { + "epoch": 0.5700292583640758, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.973289489746094, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8404210805892944, + "num_tokens": 171107341.0, + "step": 4481 + }, + { + "epoch": 0.5701564686426663, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92862319946289, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8473556637763977, + "num_tokens": 171144063.0, + "step": 4482 + }, + { + "epoch": 0.5702836789212569, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.98777198791504, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8497348427772522, + "num_tokens": 171180448.0, + "step": 4483 + }, + { + "epoch": 0.5704108891998474, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.002395629882812, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8614082336425781, + "num_tokens": 171214220.0, + "step": 4484 + }, + { + "epoch": 0.5705380994784378, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.19379997253418, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8625900745391846, + "num_tokens": 171252798.0, + "step": 4485 + }, + { + "epoch": 0.5706653097570283, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.99635124206543, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8623273372650146, + "num_tokens": 171295434.0, + "step": 4486 + }, + { + "epoch": 0.5707925200356189, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.901155471801758, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8586933612823486, + "num_tokens": 171334129.0, + "step": 4487 + }, + { + "epoch": 0.5709197303142094, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.97627830505371, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8528575897216797, + "num_tokens": 171370239.0, + "step": 4488 + }, + { + "epoch": 0.5710469405927999, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.99781608581543, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8706592321395874, + "num_tokens": 171402411.0, + "step": 4489 + }, + { + "epoch": 0.5711741508713905, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.91954231262207, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8664652705192566, + "num_tokens": 171438736.0, + "step": 4490 + }, + { + "epoch": 0.5713013611499809, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.025238037109375, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8633806109428406, + "num_tokens": 171477949.0, + "step": 4491 + }, + { + "epoch": 0.5714285714285714, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.838790893554688, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8516252636909485, + "num_tokens": 171520953.0, + "step": 4492 + }, + { + "epoch": 0.5715557817071619, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.03483009338379, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8624100089073181, + "num_tokens": 171559415.0, + "step": 4493 + }, + { + "epoch": 0.5716829919857525, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.910341262817383, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.864905595779419, + "num_tokens": 171593118.0, + "step": 4494 + }, + { + "epoch": 0.571810202264343, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.11374855041504, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8429024815559387, + "num_tokens": 171634514.0, + "step": 4495 + }, + { + "epoch": 0.5719374125429335, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.983957290649414, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8603993654251099, + "num_tokens": 171675239.0, + "step": 4496 + }, + { + "epoch": 0.5720646228215239, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.847936630249023, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8654711842536926, + "num_tokens": 171713154.0, + "step": 4497 + }, + { + "epoch": 0.5721918331001145, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.081344604492188, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8498785495758057, + "num_tokens": 171750046.0, + "step": 4498 + }, + { + "epoch": 0.572319043378705, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.136314392089844, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8468090891838074, + "num_tokens": 171782708.0, + "step": 4499 + }, + { + "epoch": 0.5724462536572955, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92205047607422, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.852067232131958, + "num_tokens": 171819421.0, + "step": 4500 + }, + { + "epoch": 0.572573463935886, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.23578643798828, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8544028997421265, + "num_tokens": 171860993.0, + "step": 4501 + }, + { + "epoch": 0.5727006742144766, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.02955436706543, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8549659252166748, + "num_tokens": 171896894.0, + "step": 4502 + }, + { + "epoch": 0.572827884493067, + "ewc_loss": 0.0234375, + "ewc_loss_parallel": 2.3484230041503906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.009700775146484, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8537058234214783, + "num_tokens": 171938694.0, + "step": 4503 + }, + { + "epoch": 0.5729550947716575, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.104419708251953, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8807042837142944, + "num_tokens": 171972606.0, + "step": 4504 + }, + { + "epoch": 0.573082305050248, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.853918075561523, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8594768047332764, + "num_tokens": 172010861.0, + "step": 4505 + }, + { + "epoch": 0.5732095153288386, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.925296783447266, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8412157297134399, + "num_tokens": 172049867.0, + "step": 4506 + }, + { + "epoch": 0.5733367256074291, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.045969009399414, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8570714592933655, + "num_tokens": 172092176.0, + "step": 4507 + }, + { + "epoch": 0.5734639358860196, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.060976028442383, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8570472002029419, + "num_tokens": 172132789.0, + "step": 4508 + }, + { + "epoch": 0.5735911461646102, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.20384407043457, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8709260821342468, + "num_tokens": 172170536.0, + "step": 4509 + }, + { + "epoch": 0.5737183564432006, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.947118759155273, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8557368516921997, + "num_tokens": 172206828.0, + "step": 4510 + }, + { + "epoch": 0.5738455667217911, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10942840576172, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8479440212249756, + "num_tokens": 172242311.0, + "step": 4511 + }, + { + "epoch": 0.5739727770003816, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.985990524291992, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8507648706436157, + "num_tokens": 172275135.0, + "step": 4512 + }, + { + "epoch": 0.5740999872789722, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.994335174560547, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8648480176925659, + "num_tokens": 172308682.0, + "step": 4513 + }, + { + "epoch": 0.5742271975575627, + "ewc_loss": 0.0233154296875, + "ewc_loss_parallel": 2.3365020751953125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.097631454467773, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8515253663063049, + "num_tokens": 172345834.0, + "step": 4514 + }, + { + "epoch": 0.5743544078361532, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.813201904296875, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8346560597419739, + "num_tokens": 172387976.0, + "step": 4515 + }, + { + "epoch": 0.5744816181147436, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.179134368896484, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8549898862838745, + "num_tokens": 172427447.0, + "step": 4516 + }, + { + "epoch": 0.5746088283933342, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.906789779663086, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8470149040222168, + "num_tokens": 172468686.0, + "step": 4517 + }, + { + "epoch": 0.5747360386719247, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.014509201049805, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8530412912368774, + "num_tokens": 172508190.0, + "step": 4518 + }, + { + "epoch": 0.5748632489505152, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.902881622314453, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8717944622039795, + "num_tokens": 172552295.0, + "step": 4519 + }, + { + "epoch": 0.5749904592291057, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.01828384399414, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8611769676208496, + "num_tokens": 172591065.0, + "step": 4520 + }, + { + "epoch": 0.5751176695076963, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.031291961669922, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8585969805717468, + "num_tokens": 172633741.0, + "step": 4521 + }, + { + "epoch": 0.5752448797862867, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.857807159423828, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.866701066493988, + "num_tokens": 172669081.0, + "step": 4522 + }, + { + "epoch": 0.5753720900648772, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0250186920166, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8346021175384521, + "num_tokens": 172704956.0, + "step": 4523 + }, + { + "epoch": 0.5754993003434677, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.973154067993164, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8602688312530518, + "num_tokens": 172742270.0, + "step": 4524 + }, + { + "epoch": 0.5756265106220583, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13932991027832, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8447489738464355, + "num_tokens": 172778065.0, + "step": 4525 + }, + { + "epoch": 0.5757537209006488, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.992984771728516, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.865129828453064, + "num_tokens": 172815825.0, + "step": 4526 + }, + { + "epoch": 0.5758809311792393, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.98359489440918, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8555124998092651, + "num_tokens": 172849560.0, + "step": 4527 + }, + { + "epoch": 0.5760081414578297, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.038846969604492, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8576782941818237, + "num_tokens": 172885203.0, + "step": 4528 + }, + { + "epoch": 0.5761353517364203, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.025476455688477, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8553926348686218, + "num_tokens": 172922341.0, + "step": 4529 + }, + { + "epoch": 0.5762625620150108, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.965606689453125, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8617938756942749, + "num_tokens": 172953363.0, + "step": 4530 + }, + { + "epoch": 0.5763897722936013, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.036502838134766, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8620741963386536, + "num_tokens": 172986253.0, + "step": 4531 + }, + { + "epoch": 0.5765169825721919, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.039371490478516, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8666678667068481, + "num_tokens": 173023466.0, + "step": 4532 + }, + { + "epoch": 0.5766441928507824, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.9622802734375, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8515117168426514, + "num_tokens": 173063329.0, + "step": 4533 + }, + { + "epoch": 0.5767714031293728, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.11011505126953, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8619653582572937, + "num_tokens": 173100676.0, + "step": 4534 + }, + { + "epoch": 0.5768986134079633, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.032896041870117, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8666281700134277, + "num_tokens": 173135088.0, + "step": 4535 + }, + { + "epoch": 0.5770258236865539, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.02149200439453, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8479093313217163, + "num_tokens": 173175810.0, + "step": 4536 + }, + { + "epoch": 0.5771530339651444, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.09107780456543, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.857700526714325, + "num_tokens": 173220826.0, + "step": 4537 + }, + { + "epoch": 0.5772802442437349, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.98265838623047, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8675168752670288, + "num_tokens": 173258411.0, + "step": 4538 + }, + { + "epoch": 0.5774074545223254, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.06082534790039, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8397350311279297, + "num_tokens": 173297298.0, + "step": 4539 + }, + { + "epoch": 0.5775346648009159, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.06388282775879, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8393073678016663, + "num_tokens": 173330839.0, + "step": 4540 + }, + { + "epoch": 0.5776618750795064, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.86293601989746, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8492327332496643, + "num_tokens": 173370219.0, + "step": 4541 + }, + { + "epoch": 0.5777890853580969, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.134838104248047, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8569939136505127, + "num_tokens": 173410425.0, + "step": 4542 + }, + { + "epoch": 0.5779162956366874, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.980579376220703, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8561421632766724, + "num_tokens": 173447109.0, + "step": 4543 + }, + { + "epoch": 0.578043505915278, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.978811264038086, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8614848256111145, + "num_tokens": 173490064.0, + "step": 4544 + }, + { + "epoch": 0.5781707161938685, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.841421127319336, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8591375946998596, + "num_tokens": 173523791.0, + "step": 4545 + }, + { + "epoch": 0.5782979264724589, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.012590408325195, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8648175597190857, + "num_tokens": 173564069.0, + "step": 4546 + }, + { + "epoch": 0.5784251367510495, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.945152282714844, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8558639883995056, + "num_tokens": 173601893.0, + "step": 4547 + }, + { + "epoch": 0.57855234702964, + "ewc_loss": 0.0235595703125, + "ewc_loss_parallel": 2.3603439331054688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.95376205444336, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8663741946220398, + "num_tokens": 173642299.0, + "step": 4548 + }, + { + "epoch": 0.5786795573082305, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.979450225830078, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8584955930709839, + "num_tokens": 173675824.0, + "step": 4549 + }, + { + "epoch": 0.578806767586821, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.109100341796875, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8586231470108032, + "num_tokens": 173712192.0, + "step": 4550 + }, + { + "epoch": 0.5789339778654116, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.001598358154297, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8636646866798401, + "num_tokens": 173746949.0, + "step": 4551 + }, + { + "epoch": 0.579061188144002, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.112926483154297, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8595399856567383, + "num_tokens": 173784527.0, + "step": 4552 + }, + { + "epoch": 0.5791883984225925, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.94987678527832, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8625693321228027, + "num_tokens": 173820648.0, + "step": 4553 + }, + { + "epoch": 0.579315608701183, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0661563873291, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8500576615333557, + "num_tokens": 173860427.0, + "step": 4554 + }, + { + "epoch": 0.5794428189797736, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.997509002685547, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8593355417251587, + "num_tokens": 173899876.0, + "step": 4555 + }, + { + "epoch": 0.5795700292583641, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.046157836914062, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8652224540710449, + "num_tokens": 173937354.0, + "step": 4556 + }, + { + "epoch": 0.5796972395369546, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.07114028930664, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8457045555114746, + "num_tokens": 173976039.0, + "step": 4557 + }, + { + "epoch": 0.5798244498155452, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.96324348449707, + "learning_rate": 1e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8371617197990417, + "num_tokens": 174006359.0, + "step": 4558 + }, + { + "epoch": 0.5799516600941356, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13119888305664, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8484501242637634, + "num_tokens": 174044662.0, + "step": 4559 + }, + { + "epoch": 0.5800788703727261, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.05192756652832, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8518840670585632, + "num_tokens": 174077133.0, + "step": 4560 + }, + { + "epoch": 0.5802060806513166, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.994699478149414, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8524658679962158, + "num_tokens": 174121873.0, + "step": 4561 + }, + { + "epoch": 0.5803332909299072, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.095712661743164, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8584684133529663, + "num_tokens": 174158873.0, + "step": 4562 + }, + { + "epoch": 0.5804605012084977, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.031063079833984, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8625308871269226, + "num_tokens": 174189222.0, + "step": 4563 + }, + { + "epoch": 0.5805877114870882, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.03742218017578, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8484671711921692, + "num_tokens": 174229018.0, + "step": 4564 + }, + { + "epoch": 0.5807149217656786, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.98301124572754, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8575071096420288, + "num_tokens": 174269215.0, + "step": 4565 + }, + { + "epoch": 0.5808421320442692, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.135934829711914, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8647623062133789, + "num_tokens": 174303926.0, + "step": 4566 + }, + { + "epoch": 0.5809693423228597, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.005592346191406, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8591576814651489, + "num_tokens": 174347491.0, + "step": 4567 + }, + { + "epoch": 0.5810965526014502, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.960710525512695, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8584325313568115, + "num_tokens": 174384246.0, + "step": 4568 + }, + { + "epoch": 0.5812237628800407, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.00001335144043, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8563356399536133, + "num_tokens": 174417081.0, + "step": 4569 + }, + { + "epoch": 0.5813509731586313, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.797330856323242, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8467857837677002, + "num_tokens": 174457364.0, + "step": 4570 + }, + { + "epoch": 0.5814781834372217, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.164594650268555, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8577573299407959, + "num_tokens": 174494304.0, + "step": 4571 + }, + { + "epoch": 0.5816053937158122, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.02906608581543, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8584834337234497, + "num_tokens": 174528047.0, + "step": 4572 + }, + { + "epoch": 0.5817326039944027, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.918601989746094, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8610832095146179, + "num_tokens": 174565990.0, + "step": 4573 + }, + { + "epoch": 0.5818598142729933, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10867691040039, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8421310186386108, + "num_tokens": 174599039.0, + "step": 4574 + }, + { + "epoch": 0.5819870245515838, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.132333755493164, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8501691818237305, + "num_tokens": 174634786.0, + "step": 4575 + }, + { + "epoch": 0.5821142348301743, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.038822174072266, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8461573719978333, + "num_tokens": 174677875.0, + "step": 4576 + }, + { + "epoch": 0.5822414451087647, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.89997100830078, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8505952954292297, + "num_tokens": 174715421.0, + "step": 4577 + }, + { + "epoch": 0.5823686553873553, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.990650177001953, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8529090285301208, + "num_tokens": 174746555.0, + "step": 4578 + }, + { + "epoch": 0.5824958656659458, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92029571533203, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8448055982589722, + "num_tokens": 174786470.0, + "step": 4579 + }, + { + "epoch": 0.5826230759445363, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.000808715820312, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8536348938941956, + "num_tokens": 174827131.0, + "step": 4580 + }, + { + "epoch": 0.5827502862231269, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0231876373291, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8502830266952515, + "num_tokens": 174861391.0, + "step": 4581 + }, + { + "epoch": 0.5828774965017174, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.21790885925293, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8607431650161743, + "num_tokens": 174897437.0, + "step": 4582 + }, + { + "epoch": 0.5830047067803078, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.82644271850586, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8469873070716858, + "num_tokens": 174932504.0, + "step": 4583 + }, + { + "epoch": 0.5831319170588983, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.24920082092285, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8513773679733276, + "num_tokens": 174974746.0, + "step": 4584 + }, + { + "epoch": 0.5832591273374889, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.871997833251953, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8510830402374268, + "num_tokens": 175016155.0, + "step": 4585 + }, + { + "epoch": 0.5833863376160794, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.028823852539062, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.840882420539856, + "num_tokens": 175060595.0, + "step": 4586 + }, + { + "epoch": 0.5835135478946699, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.03756332397461, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8561526536941528, + "num_tokens": 175101278.0, + "step": 4587 + }, + { + "epoch": 0.5836407581732604, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.057836532592773, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8492155075073242, + "num_tokens": 175139479.0, + "step": 4588 + }, + { + "epoch": 0.5837679684518509, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.99217414855957, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8391543626785278, + "num_tokens": 175182376.0, + "step": 4589 + }, + { + "epoch": 0.5838951787304414, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.98681640625, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8486181497573853, + "num_tokens": 175225449.0, + "step": 4590 + }, + { + "epoch": 0.5840223890090319, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.054630279541016, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8534722328186035, + "num_tokens": 175255959.0, + "step": 4591 + }, + { + "epoch": 0.5841495992876224, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.978485107421875, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8440679311752319, + "num_tokens": 175292142.0, + "step": 4592 + }, + { + "epoch": 0.584276809566213, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.041027069091797, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8497459292411804, + "num_tokens": 175336630.0, + "step": 4593 + }, + { + "epoch": 0.5844040198448035, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.015689849853516, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8668388724327087, + "num_tokens": 175372697.0, + "step": 4594 + }, + { + "epoch": 0.5845312301233939, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13806915283203, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8433562517166138, + "num_tokens": 175412395.0, + "step": 4595 + }, + { + "epoch": 0.5846584404019844, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.131736755371094, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8549715280532837, + "num_tokens": 175447295.0, + "step": 4596 + }, + { + "epoch": 0.584785650680575, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.055410385131836, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8654784560203552, + "num_tokens": 175480229.0, + "step": 4597 + }, + { + "epoch": 0.5849128609591655, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.004667282104492, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8551111817359924, + "num_tokens": 175523081.0, + "step": 4598 + }, + { + "epoch": 0.585040071237756, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.01540756225586, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8591462969779968, + "num_tokens": 175565036.0, + "step": 4599 + }, + { + "epoch": 0.5851672815163466, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.948453903198242, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8557642698287964, + "num_tokens": 175606733.0, + "step": 4600 + }, + { + "epoch": 0.585294491794937, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.054828643798828, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8715194463729858, + "num_tokens": 175637663.0, + "step": 4601 + }, + { + "epoch": 0.5854217020735275, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.980791091918945, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8580933809280396, + "num_tokens": 175677381.0, + "step": 4602 + }, + { + "epoch": 0.585548912352118, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.054540634155273, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8666984438896179, + "num_tokens": 175713049.0, + "step": 4603 + }, + { + "epoch": 0.5856761226307086, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13770866394043, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8465118408203125, + "num_tokens": 175753681.0, + "step": 4604 + }, + { + "epoch": 0.5858033329092991, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.924863815307617, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8432600498199463, + "num_tokens": 175789725.0, + "step": 4605 + }, + { + "epoch": 0.5859305431878896, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.167421340942383, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8548230528831482, + "num_tokens": 175823203.0, + "step": 4606 + }, + { + "epoch": 0.5860577534664801, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.091135025024414, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8497206568717957, + "num_tokens": 175860506.0, + "step": 4607 + }, + { + "epoch": 0.5861849637450706, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.821151733398438, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8725576400756836, + "num_tokens": 175895080.0, + "step": 4608 + }, + { + "epoch": 0.5863121740236611, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.172269821166992, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8502541780471802, + "num_tokens": 175937113.0, + "step": 4609 + }, + { + "epoch": 0.5864393843022516, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.987159729003906, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.868588387966156, + "num_tokens": 175966193.0, + "step": 4610 + }, + { + "epoch": 0.5865665945808421, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.07921028137207, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8489055633544922, + "num_tokens": 176003771.0, + "step": 4611 + }, + { + "epoch": 0.5866938048594327, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.96770477294922, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8604736328125, + "num_tokens": 176050069.0, + "step": 4612 + }, + { + "epoch": 0.5868210151380232, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.147558212280273, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8703542947769165, + "num_tokens": 176088763.0, + "step": 4613 + }, + { + "epoch": 0.5869482254166136, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.057802200317383, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8575388193130493, + "num_tokens": 176127853.0, + "step": 4614 + }, + { + "epoch": 0.5870754356952042, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.97413444519043, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8420512080192566, + "num_tokens": 176173775.0, + "step": 4615 + }, + { + "epoch": 0.5872026459737947, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.11145782470703, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8585525751113892, + "num_tokens": 176210878.0, + "step": 4616 + }, + { + "epoch": 0.5873298562523852, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.961503982543945, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8500099182128906, + "num_tokens": 176255094.0, + "step": 4617 + }, + { + "epoch": 0.5874570665309757, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.04845428466797, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8525356650352478, + "num_tokens": 176293586.0, + "step": 4618 + }, + { + "epoch": 0.5875842768095663, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.093852996826172, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8587696552276611, + "num_tokens": 176329203.0, + "step": 4619 + }, + { + "epoch": 0.5877114870881567, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.970361709594727, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8729337453842163, + "num_tokens": 176367984.0, + "step": 4620 + }, + { + "epoch": 0.5878386973667472, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92013931274414, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8696490526199341, + "num_tokens": 176408949.0, + "step": 4621 + }, + { + "epoch": 0.5879659076453377, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.023357391357422, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8724320530891418, + "num_tokens": 176451073.0, + "step": 4622 + }, + { + "epoch": 0.5880931179239283, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.161865234375, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8641023635864258, + "num_tokens": 176490623.0, + "step": 4623 + }, + { + "epoch": 0.5882203282025188, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.043733596801758, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8436636924743652, + "num_tokens": 176525771.0, + "step": 4624 + }, + { + "epoch": 0.5883475384811093, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.099773406982422, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8748799562454224, + "num_tokens": 176565417.0, + "step": 4625 + }, + { + "epoch": 0.5884747487596997, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.053844451904297, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8539456129074097, + "num_tokens": 176599005.0, + "step": 4626 + }, + { + "epoch": 0.5886019590382903, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13025665283203, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8590725660324097, + "num_tokens": 176637947.0, + "step": 4627 + }, + { + "epoch": 0.5887291693168808, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.132726669311523, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8457168340682983, + "num_tokens": 176675241.0, + "step": 4628 + }, + { + "epoch": 0.5888563795954713, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.074888229370117, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8499496579170227, + "num_tokens": 176715375.0, + "step": 4629 + }, + { + "epoch": 0.5889835898740619, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.12220001220703, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8567551374435425, + "num_tokens": 176750977.0, + "step": 4630 + }, + { + "epoch": 0.5891108001526524, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.829877853393555, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8597245216369629, + "num_tokens": 176788968.0, + "step": 4631 + }, + { + "epoch": 0.5892380104312428, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.99766731262207, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8523268103599548, + "num_tokens": 176824399.0, + "step": 4632 + }, + { + "epoch": 0.5893652207098333, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17905044555664, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8517601490020752, + "num_tokens": 176862053.0, + "step": 4633 + }, + { + "epoch": 0.5894924309884239, + "ewc_loss": 0.023681640625, + "ewc_loss_parallel": 2.372264862060547e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.98471450805664, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8703280687332153, + "num_tokens": 176896943.0, + "step": 4634 + }, + { + "epoch": 0.5896196412670144, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.115875244140625, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8569340109825134, + "num_tokens": 176936263.0, + "step": 4635 + }, + { + "epoch": 0.5897468515456049, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.087141036987305, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.844512939453125, + "num_tokens": 176976632.0, + "step": 4636 + }, + { + "epoch": 0.5898740618241954, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.06132698059082, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.862346887588501, + "num_tokens": 177007036.0, + "step": 4637 + }, + { + "epoch": 0.5900012721027859, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.07002067565918, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8480779528617859, + "num_tokens": 177042189.0, + "step": 4638 + }, + { + "epoch": 0.5901284823813764, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.095550537109375, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8514362573623657, + "num_tokens": 177082428.0, + "step": 4639 + }, + { + "epoch": 0.5902556926599669, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.102354049682617, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8590974807739258, + "num_tokens": 177124447.0, + "step": 4640 + }, + { + "epoch": 0.5903829029385574, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.060283660888672, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8722859621047974, + "num_tokens": 177160516.0, + "step": 4641 + }, + { + "epoch": 0.590510113217148, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.9913387298584, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8637804388999939, + "num_tokens": 177198100.0, + "step": 4642 + }, + { + "epoch": 0.5906373234957385, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.09377098083496, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8499895930290222, + "num_tokens": 177243170.0, + "step": 4643 + }, + { + "epoch": 0.5907645337743289, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.067184448242188, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8538600206375122, + "num_tokens": 177282976.0, + "step": 4644 + }, + { + "epoch": 0.5908917440529194, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.843181610107422, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8387928605079651, + "num_tokens": 177323725.0, + "step": 4645 + }, + { + "epoch": 0.59101895433151, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10329246520996, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8565261960029602, + "num_tokens": 177352178.0, + "step": 4646 + }, + { + "epoch": 0.5911461646101005, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.006935119628906, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8536677360534668, + "num_tokens": 177392322.0, + "step": 4647 + }, + { + "epoch": 0.591273374888691, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.106016159057617, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8549559116363525, + "num_tokens": 177432013.0, + "step": 4648 + }, + { + "epoch": 0.5914005851672816, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.993898391723633, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8697959184646606, + "num_tokens": 177473542.0, + "step": 4649 + }, + { + "epoch": 0.591527795445872, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.045534133911133, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8530093431472778, + "num_tokens": 177516590.0, + "step": 4650 + }, + { + "epoch": 0.5916550057244625, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.077045440673828, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8694193959236145, + "num_tokens": 177553750.0, + "step": 4651 + }, + { + "epoch": 0.591782216003053, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.941707611083984, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8494876623153687, + "num_tokens": 177586093.0, + "step": 4652 + }, + { + "epoch": 0.5919094262816436, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.212831497192383, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8528275489807129, + "num_tokens": 177626800.0, + "step": 4653 + }, + { + "epoch": 0.5920366365602341, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.093095779418945, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8540612459182739, + "num_tokens": 177662336.0, + "step": 4654 + }, + { + "epoch": 0.5921638468388246, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.1700496673584, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8645288944244385, + "num_tokens": 177706922.0, + "step": 4655 + }, + { + "epoch": 0.592291057117415, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.030656814575195, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.85491943359375, + "num_tokens": 177743922.0, + "step": 4656 + }, + { + "epoch": 0.5924182673960056, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.202560424804688, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8628841042518616, + "num_tokens": 177777069.0, + "step": 4657 + }, + { + "epoch": 0.5925454776745961, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.15101432800293, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8644781112670898, + "num_tokens": 177810959.0, + "step": 4658 + }, + { + "epoch": 0.5926726879531866, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.038349151611328, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8472853302955627, + "num_tokens": 177846366.0, + "step": 4659 + }, + { + "epoch": 0.5927998982317771, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.085060119628906, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8693082332611084, + "num_tokens": 177881450.0, + "step": 4660 + }, + { + "epoch": 0.5929271085103677, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.01717185974121, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8568677306175232, + "num_tokens": 177922931.0, + "step": 4661 + }, + { + "epoch": 0.5930543187889582, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.117889404296875, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8398231267929077, + "num_tokens": 177962722.0, + "step": 4662 + }, + { + "epoch": 0.5931815290675486, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.05253791809082, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8548119068145752, + "num_tokens": 177999956.0, + "step": 4663 + }, + { + "epoch": 0.5933087393461391, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.22263526916504, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8460583686828613, + "num_tokens": 178032450.0, + "step": 4664 + }, + { + "epoch": 0.5934359496247297, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.029808044433594, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8570692539215088, + "num_tokens": 178075028.0, + "step": 4665 + }, + { + "epoch": 0.5935631599033202, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.248247146606445, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.852270245552063, + "num_tokens": 178108267.0, + "step": 4666 + }, + { + "epoch": 0.5936903701819107, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.88504981994629, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8566522002220154, + "num_tokens": 178148091.0, + "step": 4667 + }, + { + "epoch": 0.5938175804605013, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.331621170043945, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8700777292251587, + "num_tokens": 178185626.0, + "step": 4668 + }, + { + "epoch": 0.5939447907390917, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.168014526367188, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8640193939208984, + "num_tokens": 178224165.0, + "step": 4669 + }, + { + "epoch": 0.5940720010176822, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.02165985107422, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8553943037986755, + "num_tokens": 178264118.0, + "step": 4670 + }, + { + "epoch": 0.5941992112962727, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.192615509033203, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8672602772712708, + "num_tokens": 178295634.0, + "step": 4671 + }, + { + "epoch": 0.5943264215748633, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.039026260375977, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8627618551254272, + "num_tokens": 178327076.0, + "step": 4672 + }, + { + "epoch": 0.5944536318534538, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.200952529907227, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8475837707519531, + "num_tokens": 178357469.0, + "step": 4673 + }, + { + "epoch": 0.5945808421320443, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.195201873779297, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8626382350921631, + "num_tokens": 178393830.0, + "step": 4674 + }, + { + "epoch": 0.5947080524106347, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.00796890258789, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8558286428451538, + "num_tokens": 178427560.0, + "step": 4675 + }, + { + "epoch": 0.5948352626892253, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.075885772705078, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8586656451225281, + "num_tokens": 178464666.0, + "step": 4676 + }, + { + "epoch": 0.5949624729678158, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.14080238342285, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8469897508621216, + "num_tokens": 178508771.0, + "step": 4677 + }, + { + "epoch": 0.5950896832464063, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.945228576660156, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8607584834098816, + "num_tokens": 178548516.0, + "step": 4678 + }, + { + "epoch": 0.5952168935249968, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.25284767150879, + "learning_rate": 1e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.8216447830200195, + "num_tokens": 178591276.0, + "step": 4679 + }, + { + "epoch": 0.5953441038035874, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.122928619384766, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8550022840499878, + "num_tokens": 178632580.0, + "step": 4680 + }, + { + "epoch": 0.5954713140821778, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.152111053466797, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8522632122039795, + "num_tokens": 178672858.0, + "step": 4681 + }, + { + "epoch": 0.5955985243607683, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.125389099121094, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8571725487709045, + "num_tokens": 178706435.0, + "step": 4682 + }, + { + "epoch": 0.5957257346393589, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.146896362304688, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8532007336616516, + "num_tokens": 178746309.0, + "step": 4683 + }, + { + "epoch": 0.5958529449179494, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.089975357055664, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8382200002670288, + "num_tokens": 178787639.0, + "step": 4684 + }, + { + "epoch": 0.5959801551965399, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.22158432006836, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8608995676040649, + "num_tokens": 178828271.0, + "step": 4685 + }, + { + "epoch": 0.5961073654751304, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.11550521850586, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8771892189979553, + "num_tokens": 178862862.0, + "step": 4686 + }, + { + "epoch": 0.5962345757537209, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10076332092285, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8680515289306641, + "num_tokens": 178897969.0, + "step": 4687 + }, + { + "epoch": 0.5963617860323114, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.913095474243164, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8615230321884155, + "num_tokens": 178942002.0, + "step": 4688 + }, + { + "epoch": 0.5964889963109019, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17826271057129, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8662056922912598, + "num_tokens": 178983473.0, + "step": 4689 + }, + { + "epoch": 0.5966162065894924, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.052814483642578, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8447081446647644, + "num_tokens": 179022370.0, + "step": 4690 + }, + { + "epoch": 0.596743416868083, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.98600196838379, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8712930679321289, + "num_tokens": 179064841.0, + "step": 4691 + }, + { + "epoch": 0.5968706271466735, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.057964324951172, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8289332985877991, + "num_tokens": 179099399.0, + "step": 4692 + }, + { + "epoch": 0.5969978374252639, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.929210662841797, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8475358486175537, + "num_tokens": 179133969.0, + "step": 4693 + }, + { + "epoch": 0.5971250477038544, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.362192153930664, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8534896969795227, + "num_tokens": 179165456.0, + "step": 4694 + }, + { + "epoch": 0.597252257982445, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.161378860473633, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8606630563735962, + "num_tokens": 179198209.0, + "step": 4695 + }, + { + "epoch": 0.5973794682610355, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.985414505004883, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8426640629768372, + "num_tokens": 179233872.0, + "step": 4696 + }, + { + "epoch": 0.597506678539626, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.186201095581055, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8775315284729004, + "num_tokens": 179272721.0, + "step": 4697 + }, + { + "epoch": 0.5976338888182166, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0362491607666, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8552666902542114, + "num_tokens": 179305781.0, + "step": 4698 + }, + { + "epoch": 0.597761099096807, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.243549346923828, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8608262538909912, + "num_tokens": 179350049.0, + "step": 4699 + }, + { + "epoch": 0.5978883093753975, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.396146774291992, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8557211756706238, + "num_tokens": 179388063.0, + "step": 4700 + }, + { + "epoch": 0.598015519653988, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.022493362426758, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8638502955436707, + "num_tokens": 179427607.0, + "step": 4701 + }, + { + "epoch": 0.5981427299325786, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.199750900268555, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8398009538650513, + "num_tokens": 179471456.0, + "step": 4702 + }, + { + "epoch": 0.5982699402111691, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.09485626220703, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8683232665061951, + "num_tokens": 179514318.0, + "step": 4703 + }, + { + "epoch": 0.5983971504897596, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.275096893310547, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8698296546936035, + "num_tokens": 179553307.0, + "step": 4704 + }, + { + "epoch": 0.59852436076835, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.02999496459961, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8486894369125366, + "num_tokens": 179597288.0, + "step": 4705 + }, + { + "epoch": 0.5986515710469406, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.291767120361328, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8562271595001221, + "num_tokens": 179629583.0, + "step": 4706 + }, + { + "epoch": 0.5987787813255311, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.08216094970703, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8504393100738525, + "num_tokens": 179672104.0, + "step": 4707 + }, + { + "epoch": 0.5989059916041216, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.298091888427734, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8477899432182312, + "num_tokens": 179712951.0, + "step": 4708 + }, + { + "epoch": 0.5990332018827121, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.203657150268555, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8567391633987427, + "num_tokens": 179754626.0, + "step": 4709 + }, + { + "epoch": 0.5991604121613027, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0588436126709, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8618181943893433, + "num_tokens": 179790068.0, + "step": 4710 + }, + { + "epoch": 0.5992876224398932, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.926496505737305, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8376478552818298, + "num_tokens": 179823936.0, + "step": 4711 + }, + { + "epoch": 0.5994148327184836, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.137622833251953, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8580455780029297, + "num_tokens": 179860920.0, + "step": 4712 + }, + { + "epoch": 0.5995420429970741, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.95587921142578, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8469407558441162, + "num_tokens": 179899848.0, + "step": 4713 + }, + { + "epoch": 0.5996692532756647, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.270347595214844, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8740960359573364, + "num_tokens": 179934462.0, + "step": 4714 + }, + { + "epoch": 0.5997964635542552, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.954132080078125, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8576527237892151, + "num_tokens": 179973955.0, + "step": 4715 + }, + { + "epoch": 0.5999236738328457, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10063934326172, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8593416810035706, + "num_tokens": 180015256.0, + "step": 4716 + }, + { + "epoch": 0.6000508841114363, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.16767120361328, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8640377521514893, + "num_tokens": 180054145.0, + "step": 4717 + }, + { + "epoch": 0.6001780943900267, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.036314010620117, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8618489503860474, + "num_tokens": 180093411.0, + "step": 4718 + }, + { + "epoch": 0.6003053046686172, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.089738845825195, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8665869235992432, + "num_tokens": 180135799.0, + "step": 4719 + }, + { + "epoch": 0.6004325149472077, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.206186294555664, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8509418964385986, + "num_tokens": 180172623.0, + "step": 4720 + }, + { + "epoch": 0.6005597252257983, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.073829650878906, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8534137010574341, + "num_tokens": 180213833.0, + "step": 4721 + }, + { + "epoch": 0.6006869355043888, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.084636688232422, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8640729784965515, + "num_tokens": 180254982.0, + "step": 4722 + }, + { + "epoch": 0.6008141457829793, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.11538314819336, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.864730954170227, + "num_tokens": 180288544.0, + "step": 4723 + }, + { + "epoch": 0.6009413560615697, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.03787612915039, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.853173017501831, + "num_tokens": 180330233.0, + "step": 4724 + }, + { + "epoch": 0.6010685663401603, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.221635818481445, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8603165149688721, + "num_tokens": 180361516.0, + "step": 4725 + }, + { + "epoch": 0.6011957766187508, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.099414825439453, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8557825088500977, + "num_tokens": 180404380.0, + "step": 4726 + }, + { + "epoch": 0.6013229868973413, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13651466369629, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.849922239780426, + "num_tokens": 180436916.0, + "step": 4727 + }, + { + "epoch": 0.6014501971759318, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.147459030151367, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8396141529083252, + "num_tokens": 180476895.0, + "step": 4728 + }, + { + "epoch": 0.6015774074545224, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.186124801635742, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8570559024810791, + "num_tokens": 180515248.0, + "step": 4729 + }, + { + "epoch": 0.6017046177331128, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0286808013916, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.863245964050293, + "num_tokens": 180554575.0, + "step": 4730 + }, + { + "epoch": 0.6018318280117033, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.03545570373535, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8471744656562805, + "num_tokens": 180594485.0, + "step": 4731 + }, + { + "epoch": 0.6019590382902938, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.260847091674805, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8423163890838623, + "num_tokens": 180634746.0, + "step": 4732 + }, + { + "epoch": 0.6020862485688844, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.199975967407227, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8476303815841675, + "num_tokens": 180674946.0, + "step": 4733 + }, + { + "epoch": 0.6022134588474749, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.08420753479004, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8567615747451782, + "num_tokens": 180715538.0, + "step": 4734 + }, + { + "epoch": 0.6023406691260654, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.134830474853516, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.85538250207901, + "num_tokens": 180755298.0, + "step": 4735 + }, + { + "epoch": 0.6024678794046558, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.186155319213867, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8505380153656006, + "num_tokens": 180797104.0, + "step": 4736 + }, + { + "epoch": 0.6025950896832464, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.9816837310791, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8671727180480957, + "num_tokens": 180839845.0, + "step": 4737 + }, + { + "epoch": 0.6027222999618369, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.270511627197266, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8707478046417236, + "num_tokens": 180878849.0, + "step": 4738 + }, + { + "epoch": 0.6028495102404274, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.266878128051758, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8556976318359375, + "num_tokens": 180918427.0, + "step": 4739 + }, + { + "epoch": 0.602976720519018, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.195499420166016, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8492717742919922, + "num_tokens": 180956206.0, + "step": 4740 + }, + { + "epoch": 0.6031039307976085, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.347211837768555, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8623110055923462, + "num_tokens": 180996945.0, + "step": 4741 + }, + { + "epoch": 0.6032311410761989, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17229652404785, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8621453046798706, + "num_tokens": 181031763.0, + "step": 4742 + }, + { + "epoch": 0.6033583513547894, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.082216262817383, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8651195764541626, + "num_tokens": 181070661.0, + "step": 4743 + }, + { + "epoch": 0.60348556163338, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26750946044922, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8435631394386292, + "num_tokens": 181112769.0, + "step": 4744 + }, + { + "epoch": 0.6036127719119705, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.088878631591797, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8567578792572021, + "num_tokens": 181151155.0, + "step": 4745 + }, + { + "epoch": 0.603739982190561, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.163860321044922, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8572624921798706, + "num_tokens": 181188689.0, + "step": 4746 + }, + { + "epoch": 0.6038671924691515, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.20812225341797, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8610254526138306, + "num_tokens": 181230031.0, + "step": 4747 + }, + { + "epoch": 0.603994402747742, + "ewc_loss": 0.0238037109375, + "ewc_loss_parallel": 2.384185791015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.203981399536133, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8565998673439026, + "num_tokens": 181271464.0, + "step": 4748 + }, + { + "epoch": 0.6041216130263325, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.97089385986328, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8518414497375488, + "num_tokens": 181312607.0, + "step": 4749 + }, + { + "epoch": 0.604248823304923, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26318359375, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8655158281326294, + "num_tokens": 181348628.0, + "step": 4750 + }, + { + "epoch": 0.6043760335835135, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.160322189331055, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.848919689655304, + "num_tokens": 181387975.0, + "step": 4751 + }, + { + "epoch": 0.6045032438621041, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0560245513916, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8696058988571167, + "num_tokens": 181424163.0, + "step": 4752 + }, + { + "epoch": 0.6046304541406946, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.135610580444336, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8469027876853943, + "num_tokens": 181457002.0, + "step": 4753 + }, + { + "epoch": 0.604757664419285, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.21598243713379, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8522664308547974, + "num_tokens": 181494410.0, + "step": 4754 + }, + { + "epoch": 0.6048848746978756, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.192014694213867, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8544108867645264, + "num_tokens": 181534595.0, + "step": 4755 + }, + { + "epoch": 0.6050120849764661, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.209178924560547, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8629359006881714, + "num_tokens": 181570654.0, + "step": 4756 + }, + { + "epoch": 0.6051392952550566, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.37563133239746, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8516747951507568, + "num_tokens": 181608366.0, + "step": 4757 + }, + { + "epoch": 0.6052665055336471, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.018253326416016, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8445306420326233, + "num_tokens": 181647168.0, + "step": 4758 + }, + { + "epoch": 0.6053937158122377, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.14084815979004, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8602025508880615, + "num_tokens": 181689177.0, + "step": 4759 + }, + { + "epoch": 0.6055209260908282, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33574104309082, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8618243932723999, + "num_tokens": 181723742.0, + "step": 4760 + }, + { + "epoch": 0.6056481363694186, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.171112060546875, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.865517258644104, + "num_tokens": 181761539.0, + "step": 4761 + }, + { + "epoch": 0.6057753466480091, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.156301498413086, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8553842306137085, + "num_tokens": 181799674.0, + "step": 4762 + }, + { + "epoch": 0.6059025569265997, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.136606216430664, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8450331687927246, + "num_tokens": 181838784.0, + "step": 4763 + }, + { + "epoch": 0.6060297672051902, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.14498519897461, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8529865741729736, + "num_tokens": 181875669.0, + "step": 4764 + }, + { + "epoch": 0.6061569774837807, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.92983627319336, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8403640985488892, + "num_tokens": 181912450.0, + "step": 4765 + }, + { + "epoch": 0.6062841877623713, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.394113540649414, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.857121467590332, + "num_tokens": 181947691.0, + "step": 4766 + }, + { + "epoch": 0.6064113980409617, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.90976905822754, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8620160222053528, + "num_tokens": 181982364.0, + "step": 4767 + }, + { + "epoch": 0.6065386083195522, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.240718841552734, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.856712281703949, + "num_tokens": 182017071.0, + "step": 4768 + }, + { + "epoch": 0.6066658185981427, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.99233627319336, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8570064306259155, + "num_tokens": 182057234.0, + "step": 4769 + }, + { + "epoch": 0.6067930288767333, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.087522506713867, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8675752282142639, + "num_tokens": 182092450.0, + "step": 4770 + }, + { + "epoch": 0.6069202391553238, + "ewc_loss": 0.0240478515625, + "ewc_loss_parallel": 2.4080276489257812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49541473388672, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8497706055641174, + "num_tokens": 182126398.0, + "step": 4771 + }, + { + "epoch": 0.6070474494339143, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.62558937072754, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8707361221313477, + "num_tokens": 182160848.0, + "step": 4772 + }, + { + "epoch": 0.6071746597125047, + "ewc_loss": 0.02392578125, + "ewc_loss_parallel": 2.396106719970703e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.164081573486328, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8651416301727295, + "num_tokens": 182202596.0, + "step": 4773 + }, + { + "epoch": 0.6073018699910953, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.437904357910156, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8555787205696106, + "num_tokens": 182239235.0, + "step": 4774 + }, + { + "epoch": 0.6074290802696858, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48590087890625, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8690982460975647, + "num_tokens": 182273662.0, + "step": 4775 + }, + { + "epoch": 0.6075562905482763, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.14095687866211, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8421767354011536, + "num_tokens": 182311991.0, + "step": 4776 + }, + { + "epoch": 0.6076835008268668, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.505508422851562, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8594459295272827, + "num_tokens": 182350722.0, + "step": 4777 + }, + { + "epoch": 0.6078107111054574, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.053279876708984, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8605539798736572, + "num_tokens": 182386755.0, + "step": 4778 + }, + { + "epoch": 0.6079379213840478, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.08588981628418, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8741030693054199, + "num_tokens": 182423128.0, + "step": 4779 + }, + { + "epoch": 0.6080651316626383, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.295482635498047, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8434957265853882, + "num_tokens": 182461518.0, + "step": 4780 + }, + { + "epoch": 0.6081923419412288, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17003059387207, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8567782044410706, + "num_tokens": 182495707.0, + "step": 4781 + }, + { + "epoch": 0.6083195522198194, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.2026424407959, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8475257158279419, + "num_tokens": 182539058.0, + "step": 4782 + }, + { + "epoch": 0.6084467624984099, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.207202911376953, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8435447812080383, + "num_tokens": 182572482.0, + "step": 4783 + }, + { + "epoch": 0.6085739727770004, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10700798034668, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8610730171203613, + "num_tokens": 182607665.0, + "step": 4784 + }, + { + "epoch": 0.6087011830555908, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.15528106689453, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8593318462371826, + "num_tokens": 182650503.0, + "step": 4785 + }, + { + "epoch": 0.6088283933341814, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.349246978759766, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8676425218582153, + "num_tokens": 182681662.0, + "step": 4786 + }, + { + "epoch": 0.6089556036127719, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10284996032715, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8444859981536865, + "num_tokens": 182714912.0, + "step": 4787 + }, + { + "epoch": 0.6090828138913624, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.14470672607422, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8546040058135986, + "num_tokens": 182748813.0, + "step": 4788 + }, + { + "epoch": 0.609210024169953, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.29010581970215, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8417824506759644, + "num_tokens": 182786123.0, + "step": 4789 + }, + { + "epoch": 0.6093372344485435, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.112110137939453, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8704767823219299, + "num_tokens": 182820550.0, + "step": 4790 + }, + { + "epoch": 0.6094644447271339, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.019699096679688, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.85825514793396, + "num_tokens": 182863135.0, + "step": 4791 + }, + { + "epoch": 0.6095916550057244, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.195720672607422, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8594709634780884, + "num_tokens": 182905193.0, + "step": 4792 + }, + { + "epoch": 0.609718865284315, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.25491714477539, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.85744309425354, + "num_tokens": 182944934.0, + "step": 4793 + }, + { + "epoch": 0.6098460755629055, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.245738983154297, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8447966575622559, + "num_tokens": 182984604.0, + "step": 4794 + }, + { + "epoch": 0.609973285841496, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.09663200378418, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8558530807495117, + "num_tokens": 183020421.0, + "step": 4795 + }, + { + "epoch": 0.6101004961200865, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.16376304626465, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8491611480712891, + "num_tokens": 183057635.0, + "step": 4796 + }, + { + "epoch": 0.610227706398677, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.024921417236328, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8445850014686584, + "num_tokens": 183094689.0, + "step": 4797 + }, + { + "epoch": 0.6103549166772675, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.144989013671875, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8631619215011597, + "num_tokens": 183129118.0, + "step": 4798 + }, + { + "epoch": 0.610482126955858, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.2542667388916, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8543691635131836, + "num_tokens": 183163365.0, + "step": 4799 + }, + { + "epoch": 0.6106093372344485, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.048084259033203, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8606115579605103, + "num_tokens": 183199271.0, + "step": 4800 + }, + { + "epoch": 0.6107365475130391, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.337865829467773, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8769131898880005, + "num_tokens": 183234523.0, + "step": 4801 + }, + { + "epoch": 0.6108637577916296, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.024301528930664, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8584392666816711, + "num_tokens": 183267098.0, + "step": 4802 + }, + { + "epoch": 0.61099096807022, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.29313087463379, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8569207191467285, + "num_tokens": 183306825.0, + "step": 4803 + }, + { + "epoch": 0.6111181783488105, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10619354248047, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.857017993927002, + "num_tokens": 183348712.0, + "step": 4804 + }, + { + "epoch": 0.6112453886274011, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.187942504882812, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8700573444366455, + "num_tokens": 183383797.0, + "step": 4805 + }, + { + "epoch": 0.6113725989059916, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.138927459716797, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8692634105682373, + "num_tokens": 183417484.0, + "step": 4806 + }, + { + "epoch": 0.6114998091845821, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.143142700195312, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8631926774978638, + "num_tokens": 183453808.0, + "step": 4807 + }, + { + "epoch": 0.6116270194631727, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.201345443725586, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8643326163291931, + "num_tokens": 183496693.0, + "step": 4808 + }, + { + "epoch": 0.6117542297417632, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.193056106567383, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8605009913444519, + "num_tokens": 183536727.0, + "step": 4809 + }, + { + "epoch": 0.6118814400203536, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.189773559570312, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8590361475944519, + "num_tokens": 183571809.0, + "step": 4810 + }, + { + "epoch": 0.6120086502989441, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.30063247680664, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8567649126052856, + "num_tokens": 183610761.0, + "step": 4811 + }, + { + "epoch": 0.6121358605775347, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.119718551635742, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8566350936889648, + "num_tokens": 183648304.0, + "step": 4812 + }, + { + "epoch": 0.6122630708561252, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.36776351928711, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8531885147094727, + "num_tokens": 183684201.0, + "step": 4813 + }, + { + "epoch": 0.6123902811347157, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.026288986206055, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8512142300605774, + "num_tokens": 183726595.0, + "step": 4814 + }, + { + "epoch": 0.6125174914133062, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26581382751465, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8661518692970276, + "num_tokens": 183762222.0, + "step": 4815 + }, + { + "epoch": 0.6126447016918967, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.318906784057617, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8705626726150513, + "num_tokens": 183797167.0, + "step": 4816 + }, + { + "epoch": 0.6127719119704872, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.01767349243164, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8569883108139038, + "num_tokens": 183839591.0, + "step": 4817 + }, + { + "epoch": 0.6128991222490777, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.264558792114258, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.857342004776001, + "num_tokens": 183877862.0, + "step": 4818 + }, + { + "epoch": 0.6130263325276682, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.19065284729004, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8630401492118835, + "num_tokens": 183915964.0, + "step": 4819 + }, + { + "epoch": 0.6131535428062588, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.152212142944336, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8554469347000122, + "num_tokens": 183952823.0, + "step": 4820 + }, + { + "epoch": 0.6132807530848493, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.219730377197266, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8480724096298218, + "num_tokens": 183991650.0, + "step": 4821 + }, + { + "epoch": 0.6134079633634397, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13853645324707, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8592666387557983, + "num_tokens": 184027768.0, + "step": 4822 + }, + { + "epoch": 0.6135351736420303, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.106769561767578, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8684084415435791, + "num_tokens": 184063754.0, + "step": 4823 + }, + { + "epoch": 0.6136623839206208, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.08326530456543, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8515139818191528, + "num_tokens": 184105286.0, + "step": 4824 + }, + { + "epoch": 0.6137895941992113, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.136756896972656, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8621273040771484, + "num_tokens": 184139312.0, + "step": 4825 + }, + { + "epoch": 0.6139168044778018, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.267745971679688, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8575047850608826, + "num_tokens": 184181541.0, + "step": 4826 + }, + { + "epoch": 0.6140440147563924, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.10831642150879, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8569502830505371, + "num_tokens": 184216857.0, + "step": 4827 + }, + { + "epoch": 0.6141712250349828, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.310100555419922, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8617556691169739, + "num_tokens": 184253318.0, + "step": 4828 + }, + { + "epoch": 0.6142984353135733, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26590347290039, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8691016435623169, + "num_tokens": 184298625.0, + "step": 4829 + }, + { + "epoch": 0.6144256455921638, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.248458862304688, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.85548996925354, + "num_tokens": 184334569.0, + "step": 4830 + }, + { + "epoch": 0.6145528558707544, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.233434677124023, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8409391641616821, + "num_tokens": 184377477.0, + "step": 4831 + }, + { + "epoch": 0.6146800661493449, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.165069580078125, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8452353477478027, + "num_tokens": 184416671.0, + "step": 4832 + }, + { + "epoch": 0.6148072764279354, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.127044677734375, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8523191213607788, + "num_tokens": 184453174.0, + "step": 4833 + }, + { + "epoch": 0.6149344867065258, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.193817138671875, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8644096255302429, + "num_tokens": 184488313.0, + "step": 4834 + }, + { + "epoch": 0.6150616969851164, + "ewc_loss": 0.024169921875, + "ewc_loss_parallel": 2.4199485778808594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.990467071533203, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8382129669189453, + "num_tokens": 184523733.0, + "step": 4835 + }, + { + "epoch": 0.6151889072637069, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.184738159179688, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8555819392204285, + "num_tokens": 184568399.0, + "step": 4836 + }, + { + "epoch": 0.6153161175422974, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.092594146728516, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8563570976257324, + "num_tokens": 184603302.0, + "step": 4837 + }, + { + "epoch": 0.615443327820888, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.18210792541504, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8768134117126465, + "num_tokens": 184637346.0, + "step": 4838 + }, + { + "epoch": 0.6155705380994785, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.090208053588867, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8639844059944153, + "num_tokens": 184671687.0, + "step": 4839 + }, + { + "epoch": 0.6156977483780689, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.085805892944336, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8669288158416748, + "num_tokens": 184706867.0, + "step": 4840 + }, + { + "epoch": 0.6158249586566594, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.137378692626953, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8539061546325684, + "num_tokens": 184743916.0, + "step": 4841 + }, + { + "epoch": 0.61595216893525, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.254497528076172, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8659744262695312, + "num_tokens": 184783247.0, + "step": 4842 + }, + { + "epoch": 0.6160793792138405, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.279788970947266, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.857189416885376, + "num_tokens": 184826501.0, + "step": 4843 + }, + { + "epoch": 0.616206589492431, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.97884178161621, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8558295369148254, + "num_tokens": 184868743.0, + "step": 4844 + }, + { + "epoch": 0.6163337997710215, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.045724868774414, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8546617031097412, + "num_tokens": 184908992.0, + "step": 4845 + }, + { + "epoch": 0.616461010049612, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.318828582763672, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8460566997528076, + "num_tokens": 184942456.0, + "step": 4846 + }, + { + "epoch": 0.6165882203282025, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.974977493286133, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8481084108352661, + "num_tokens": 184975834.0, + "step": 4847 + }, + { + "epoch": 0.616715430606793, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.11850357055664, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8801706433296204, + "num_tokens": 185012295.0, + "step": 4848 + }, + { + "epoch": 0.6168426408853835, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.195253372192383, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8621435165405273, + "num_tokens": 185049272.0, + "step": 4849 + }, + { + "epoch": 0.6169698511639741, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.218116760253906, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8697640895843506, + "num_tokens": 185081054.0, + "step": 4850 + }, + { + "epoch": 0.6170970614425646, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.114089965820312, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8538007140159607, + "num_tokens": 185119809.0, + "step": 4851 + }, + { + "epoch": 0.617224271721155, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.175535202026367, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8692424893379211, + "num_tokens": 185161462.0, + "step": 4852 + }, + { + "epoch": 0.6173514819997455, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.227983474731445, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8499837517738342, + "num_tokens": 185196238.0, + "step": 4853 + }, + { + "epoch": 0.6174786922783361, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.069326400756836, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8567209243774414, + "num_tokens": 185239317.0, + "step": 4854 + }, + { + "epoch": 0.6176059025569266, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.140525817871094, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8672628402709961, + "num_tokens": 185273555.0, + "step": 4855 + }, + { + "epoch": 0.6177331128355171, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.25809097290039, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8662193417549133, + "num_tokens": 185309721.0, + "step": 4856 + }, + { + "epoch": 0.6178603231141077, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.995773315429688, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8493630886077881, + "num_tokens": 185350846.0, + "step": 4857 + }, + { + "epoch": 0.6179875333926982, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.073131561279297, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8513650894165039, + "num_tokens": 185392472.0, + "step": 4858 + }, + { + "epoch": 0.6181147436712886, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.154176712036133, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8545622229576111, + "num_tokens": 185438250.0, + "step": 4859 + }, + { + "epoch": 0.6182419539498791, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.12129783630371, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8550342321395874, + "num_tokens": 185479455.0, + "step": 4860 + }, + { + "epoch": 0.6183691642284697, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.2392635345459, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8544049263000488, + "num_tokens": 185518719.0, + "step": 4861 + }, + { + "epoch": 0.6184963745070602, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.12110710144043, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8434734344482422, + "num_tokens": 185556671.0, + "step": 4862 + }, + { + "epoch": 0.6186235847856507, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.137983322143555, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8555978536605835, + "num_tokens": 185595063.0, + "step": 4863 + }, + { + "epoch": 0.6187507950642412, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.132829666137695, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8475676774978638, + "num_tokens": 185635626.0, + "step": 4864 + }, + { + "epoch": 0.6188780053428317, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.291772842407227, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8463969826698303, + "num_tokens": 185671687.0, + "step": 4865 + }, + { + "epoch": 0.6190052156214222, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.198017120361328, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.86089026927948, + "num_tokens": 185706459.0, + "step": 4866 + }, + { + "epoch": 0.6191324259000127, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.144474029541016, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8759807348251343, + "num_tokens": 185743162.0, + "step": 4867 + }, + { + "epoch": 0.6192596361786032, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.075729370117188, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8420301675796509, + "num_tokens": 185784036.0, + "step": 4868 + }, + { + "epoch": 0.6193868464571938, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.135799407958984, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8540173768997192, + "num_tokens": 185821407.0, + "step": 4869 + }, + { + "epoch": 0.6195140567357843, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.237524032592773, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8626790046691895, + "num_tokens": 185857789.0, + "step": 4870 + }, + { + "epoch": 0.6196412670143747, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.072235107421875, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.868215024471283, + "num_tokens": 185896989.0, + "step": 4871 + }, + { + "epoch": 0.6197684772929652, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.146196365356445, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8726901412010193, + "num_tokens": 185938123.0, + "step": 4872 + }, + { + "epoch": 0.6198956875715558, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.12956428527832, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8577864766120911, + "num_tokens": 185973440.0, + "step": 4873 + }, + { + "epoch": 0.6200228978501463, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.168941497802734, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8584272861480713, + "num_tokens": 186010069.0, + "step": 4874 + }, + { + "epoch": 0.6201501081287368, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.102439880371094, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.851668119430542, + "num_tokens": 186048365.0, + "step": 4875 + }, + { + "epoch": 0.6202773184073274, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.360694885253906, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8626779913902283, + "num_tokens": 186085358.0, + "step": 4876 + }, + { + "epoch": 0.6204045286859178, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.101715087890625, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8620635271072388, + "num_tokens": 186118476.0, + "step": 4877 + }, + { + "epoch": 0.6205317389645083, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.136518478393555, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8540284633636475, + "num_tokens": 186155487.0, + "step": 4878 + }, + { + "epoch": 0.6206589492430988, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.143720626831055, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8626376390457153, + "num_tokens": 186192941.0, + "step": 4879 + }, + { + "epoch": 0.6207861595216894, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.154380798339844, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8604750633239746, + "num_tokens": 186233860.0, + "step": 4880 + }, + { + "epoch": 0.6209133698002799, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.11132049560547, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8466952443122864, + "num_tokens": 186273081.0, + "step": 4881 + }, + { + "epoch": 0.6210405800788704, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.152921676635742, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8630801439285278, + "num_tokens": 186310596.0, + "step": 4882 + }, + { + "epoch": 0.6211677903574608, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.1700382232666, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8538500666618347, + "num_tokens": 186346262.0, + "step": 4883 + }, + { + "epoch": 0.6212950006360514, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.34009552001953, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8606337308883667, + "num_tokens": 186386508.0, + "step": 4884 + }, + { + "epoch": 0.6214222109146419, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.110977172851562, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8537098169326782, + "num_tokens": 186425184.0, + "step": 4885 + }, + { + "epoch": 0.6215494211932324, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.169490814208984, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8569860458374023, + "num_tokens": 186463072.0, + "step": 4886 + }, + { + "epoch": 0.621676631471823, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.221338272094727, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8682944774627686, + "num_tokens": 186504062.0, + "step": 4887 + }, + { + "epoch": 0.6218038417504135, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.35624122619629, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8483598828315735, + "num_tokens": 186544564.0, + "step": 4888 + }, + { + "epoch": 0.6219310520290039, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.13986587524414, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8445771932601929, + "num_tokens": 186587196.0, + "step": 4889 + }, + { + "epoch": 0.6220582623075944, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.40725326538086, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.857997477054596, + "num_tokens": 186620673.0, + "step": 4890 + }, + { + "epoch": 0.622185472586185, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.374853134155273, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8603109121322632, + "num_tokens": 186654028.0, + "step": 4891 + }, + { + "epoch": 0.6223126828647755, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.138303756713867, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.854319155216217, + "num_tokens": 186693395.0, + "step": 4892 + }, + { + "epoch": 0.622439893143366, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.24721908569336, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8403393030166626, + "num_tokens": 186734440.0, + "step": 4893 + }, + { + "epoch": 0.6225671034219565, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.198410034179688, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8595511317253113, + "num_tokens": 186769637.0, + "step": 4894 + }, + { + "epoch": 0.622694313700547, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.339698791503906, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8534422516822815, + "num_tokens": 186810188.0, + "step": 4895 + }, + { + "epoch": 0.6228215239791375, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.295223236083984, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.848124623298645, + "num_tokens": 186847512.0, + "step": 4896 + }, + { + "epoch": 0.622948734257728, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.288841247558594, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8671271204948425, + "num_tokens": 186878721.0, + "step": 4897 + }, + { + "epoch": 0.6230759445363185, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.22123146057129, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8623150587081909, + "num_tokens": 186920239.0, + "step": 4898 + }, + { + "epoch": 0.6232031548149091, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17087173461914, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8505972027778625, + "num_tokens": 186964158.0, + "step": 4899 + }, + { + "epoch": 0.6233303650934996, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.127723693847656, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8728334903717041, + "num_tokens": 187003176.0, + "step": 4900 + }, + { + "epoch": 0.62345757537209, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.30554962158203, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.856310248374939, + "num_tokens": 187046152.0, + "step": 4901 + }, + { + "epoch": 0.6235847856506805, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17713737487793, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8690441250801086, + "num_tokens": 187086629.0, + "step": 4902 + }, + { + "epoch": 0.6237119959292711, + "ewc_loss": 0.0242919921875, + "ewc_loss_parallel": 2.4318695068359375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.09726333618164, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8524888753890991, + "num_tokens": 187122856.0, + "step": 4903 + }, + { + "epoch": 0.6238392062078616, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.27121353149414, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8726769089698792, + "num_tokens": 187161568.0, + "step": 4904 + }, + { + "epoch": 0.6239664164864521, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.15854263305664, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8574921488761902, + "num_tokens": 187202831.0, + "step": 4905 + }, + { + "epoch": 0.6240936267650427, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.217994689941406, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.865024209022522, + "num_tokens": 187239059.0, + "step": 4906 + }, + { + "epoch": 0.6242208370436332, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.36332893371582, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8515938520431519, + "num_tokens": 187272940.0, + "step": 4907 + }, + { + "epoch": 0.6243480473222236, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.159526824951172, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8712313175201416, + "num_tokens": 187310963.0, + "step": 4908 + }, + { + "epoch": 0.6244752576008141, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.180381774902344, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8776190280914307, + "num_tokens": 187345698.0, + "step": 4909 + }, + { + "epoch": 0.6246024678794047, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.352540969848633, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8557994365692139, + "num_tokens": 187384762.0, + "step": 4910 + }, + { + "epoch": 0.6247296781579952, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.30299186706543, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8475568294525146, + "num_tokens": 187424288.0, + "step": 4911 + }, + { + "epoch": 0.6248568884365857, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.039457321166992, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8670694231987, + "num_tokens": 187461481.0, + "step": 4912 + }, + { + "epoch": 0.6249840987151762, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.162525177001953, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8731701374053955, + "num_tokens": 187497690.0, + "step": 4913 + }, + { + "epoch": 0.6251113089937667, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.1556396484375, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8520582318305969, + "num_tokens": 187537825.0, + "step": 4914 + }, + { + "epoch": 0.6252385192723572, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.07340431213379, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.868394136428833, + "num_tokens": 187583998.0, + "step": 4915 + }, + { + "epoch": 0.6253657295509477, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.12090492248535, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8572065830230713, + "num_tokens": 187621962.0, + "step": 4916 + }, + { + "epoch": 0.6254929398295382, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.133018493652344, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8637489676475525, + "num_tokens": 187655913.0, + "step": 4917 + }, + { + "epoch": 0.6256201501081288, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.058629989624023, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8500975370407104, + "num_tokens": 187692433.0, + "step": 4918 + }, + { + "epoch": 0.6257473603867193, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26864242553711, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8699916005134583, + "num_tokens": 187724311.0, + "step": 4919 + }, + { + "epoch": 0.6258745706653097, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.18960952758789, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8669251203536987, + "num_tokens": 187766614.0, + "step": 4920 + }, + { + "epoch": 0.6260017809439002, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.16292953491211, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8693192601203918, + "num_tokens": 187804847.0, + "step": 4921 + }, + { + "epoch": 0.6261289912224908, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.163406372070312, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8590748310089111, + "num_tokens": 187842791.0, + "step": 4922 + }, + { + "epoch": 0.6262562015010813, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.34959602355957, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8482576608657837, + "num_tokens": 187878304.0, + "step": 4923 + }, + { + "epoch": 0.6263834117796718, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.027164459228516, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8596327900886536, + "num_tokens": 187915022.0, + "step": 4924 + }, + { + "epoch": 0.6265106220582624, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.325563430786133, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8587279319763184, + "num_tokens": 187950533.0, + "step": 4925 + }, + { + "epoch": 0.6266378323368528, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.01557159423828, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8538024425506592, + "num_tokens": 187989893.0, + "step": 4926 + }, + { + "epoch": 0.6267650426154433, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.30216407775879, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8509602546691895, + "num_tokens": 188025031.0, + "step": 4927 + }, + { + "epoch": 0.6268922528940338, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.281888961791992, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8502963781356812, + "num_tokens": 188060797.0, + "step": 4928 + }, + { + "epoch": 0.6270194631726244, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.24144172668457, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8514930009841919, + "num_tokens": 188096271.0, + "step": 4929 + }, + { + "epoch": 0.6271466734512149, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17159652709961, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8584821224212646, + "num_tokens": 188136697.0, + "step": 4930 + }, + { + "epoch": 0.6272738837298054, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.24480628967285, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8563743233680725, + "num_tokens": 188174715.0, + "step": 4931 + }, + { + "epoch": 0.6274010940083958, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.29486846923828, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8363273739814758, + "num_tokens": 188216379.0, + "step": 4932 + }, + { + "epoch": 0.6275283042869864, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.167484283447266, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8632183074951172, + "num_tokens": 188252183.0, + "step": 4933 + }, + { + "epoch": 0.6276555145655769, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.312332153320312, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8607858419418335, + "num_tokens": 188291128.0, + "step": 4934 + }, + { + "epoch": 0.6277827248441674, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.180583953857422, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8411440849304199, + "num_tokens": 188334256.0, + "step": 4935 + }, + { + "epoch": 0.627909935122758, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.241832733154297, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.849702000617981, + "num_tokens": 188373207.0, + "step": 4936 + }, + { + "epoch": 0.6280371454013485, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.333660125732422, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8547837138175964, + "num_tokens": 188414937.0, + "step": 4937 + }, + { + "epoch": 0.6281643556799389, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.0964298248291, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8550196886062622, + "num_tokens": 188455398.0, + "step": 4938 + }, + { + "epoch": 0.6282915659585294, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.297401428222656, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8581081628799438, + "num_tokens": 188496561.0, + "step": 4939 + }, + { + "epoch": 0.62841877623712, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.198253631591797, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8611718416213989, + "num_tokens": 188531318.0, + "step": 4940 + }, + { + "epoch": 0.6285459865157105, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.263072967529297, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8494687080383301, + "num_tokens": 188566807.0, + "step": 4941 + }, + { + "epoch": 0.628673196794301, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.180221557617188, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8538011908531189, + "num_tokens": 188608062.0, + "step": 4942 + }, + { + "epoch": 0.6288004070728915, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.265647888183594, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8689291477203369, + "num_tokens": 188644953.0, + "step": 4943 + }, + { + "epoch": 0.628927617351482, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.247556686401367, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8729256987571716, + "num_tokens": 188687720.0, + "step": 4944 + }, + { + "epoch": 0.6290548276300725, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.28855323791504, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8507585525512695, + "num_tokens": 188730077.0, + "step": 4945 + }, + { + "epoch": 0.629182037908663, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.318204879760742, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8639599084854126, + "num_tokens": 188768306.0, + "step": 4946 + }, + { + "epoch": 0.6293092481872535, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.35731315612793, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8373095393180847, + "num_tokens": 188807480.0, + "step": 4947 + }, + { + "epoch": 0.6294364584658441, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.267292022705078, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8483549356460571, + "num_tokens": 188836383.0, + "step": 4948 + }, + { + "epoch": 0.6295636687444346, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.315717697143555, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8604462742805481, + "num_tokens": 188876079.0, + "step": 4949 + }, + { + "epoch": 0.629690879023025, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26259422302246, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8541063070297241, + "num_tokens": 188912936.0, + "step": 4950 + }, + { + "epoch": 0.6298180893016155, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.280630111694336, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8558061718940735, + "num_tokens": 188951475.0, + "step": 4951 + }, + { + "epoch": 0.6299452995802061, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.130290985107422, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8433521389961243, + "num_tokens": 188996972.0, + "step": 4952 + }, + { + "epoch": 0.6300725098587966, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.295255661010742, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8772457242012024, + "num_tokens": 189030706.0, + "step": 4953 + }, + { + "epoch": 0.6301997201373871, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.285818099975586, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8803935050964355, + "num_tokens": 189065638.0, + "step": 4954 + }, + { + "epoch": 0.6303269304159776, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.31952476501465, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8707148432731628, + "num_tokens": 189101747.0, + "step": 4955 + }, + { + "epoch": 0.6304541406945681, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.252004623413086, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8548365831375122, + "num_tokens": 189142965.0, + "step": 4956 + }, + { + "epoch": 0.6305813509731586, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.172828674316406, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.862547755241394, + "num_tokens": 189177822.0, + "step": 4957 + }, + { + "epoch": 0.6307085612517491, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.523576736450195, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8515690565109253, + "num_tokens": 189215710.0, + "step": 4958 + }, + { + "epoch": 0.6308357715303397, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.186403274536133, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8314673900604248, + "num_tokens": 189250788.0, + "step": 4959 + }, + { + "epoch": 0.6309629818089302, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.42388916015625, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8547137975692749, + "num_tokens": 189278451.0, + "step": 4960 + }, + { + "epoch": 0.6310901920875207, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.300615310668945, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8611125946044922, + "num_tokens": 189316857.0, + "step": 4961 + }, + { + "epoch": 0.6312174023661112, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.139556884765625, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8684132099151611, + "num_tokens": 189356600.0, + "step": 4962 + }, + { + "epoch": 0.6313446126447017, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.373323440551758, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.852148175239563, + "num_tokens": 189393134.0, + "step": 4963 + }, + { + "epoch": 0.6314718229232922, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.136287689208984, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8430793285369873, + "num_tokens": 189433431.0, + "step": 4964 + }, + { + "epoch": 0.6315990332018827, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.254051208496094, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8445813655853271, + "num_tokens": 189473738.0, + "step": 4965 + }, + { + "epoch": 0.6317262434804732, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.103076934814453, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8713893890380859, + "num_tokens": 189509930.0, + "step": 4966 + }, + { + "epoch": 0.6318534537590638, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.443071365356445, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8627078533172607, + "num_tokens": 189547801.0, + "step": 4967 + }, + { + "epoch": 0.6319806640376543, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.25766944885254, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.870903730392456, + "num_tokens": 189582226.0, + "step": 4968 + }, + { + "epoch": 0.6321078743162447, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33940887451172, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8572348952293396, + "num_tokens": 189622915.0, + "step": 4969 + }, + { + "epoch": 0.6322350845948352, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.333524703979492, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8655270338058472, + "num_tokens": 189658182.0, + "step": 4970 + }, + { + "epoch": 0.6323622948734258, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.39955711364746, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8602157831192017, + "num_tokens": 189696627.0, + "step": 4971 + }, + { + "epoch": 0.6324895051520163, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.44563865661621, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.856278657913208, + "num_tokens": 189729826.0, + "step": 4972 + }, + { + "epoch": 0.6326167154306068, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.357521057128906, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8640018701553345, + "num_tokens": 189771504.0, + "step": 4973 + }, + { + "epoch": 0.6327439257091974, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.269926071166992, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8745754957199097, + "num_tokens": 189809220.0, + "step": 4974 + }, + { + "epoch": 0.6328711359877878, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.339282989501953, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8669488430023193, + "num_tokens": 189849586.0, + "step": 4975 + }, + { + "epoch": 0.6329983462663783, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.203493118286133, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.859819769859314, + "num_tokens": 189885054.0, + "step": 4976 + }, + { + "epoch": 0.6331255565449688, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.199352264404297, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8515830039978027, + "num_tokens": 189918286.0, + "step": 4977 + }, + { + "epoch": 0.6332527668235594, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.30942153930664, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8398183584213257, + "num_tokens": 189956845.0, + "step": 4978 + }, + { + "epoch": 0.6333799771021499, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.230894088745117, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8751174211502075, + "num_tokens": 189994827.0, + "step": 4979 + }, + { + "epoch": 0.6335071873807404, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.295469284057617, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8527327179908752, + "num_tokens": 190036066.0, + "step": 4980 + }, + { + "epoch": 0.6336343976593308, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.306127548217773, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8695775270462036, + "num_tokens": 190073326.0, + "step": 4981 + }, + { + "epoch": 0.6337616079379214, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.277385711669922, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8563178181648254, + "num_tokens": 190109125.0, + "step": 4982 + }, + { + "epoch": 0.6338888182165119, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.24909210205078, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8647117614746094, + "num_tokens": 190148520.0, + "step": 4983 + }, + { + "epoch": 0.6340160284951024, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.259248733520508, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8610219955444336, + "num_tokens": 190184272.0, + "step": 4984 + }, + { + "epoch": 0.6341432387736929, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.35799217224121, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8575198650360107, + "num_tokens": 190221483.0, + "step": 4985 + }, + { + "epoch": 0.6342704490522835, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.016738891601562, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8704782724380493, + "num_tokens": 190266430.0, + "step": 4986 + }, + { + "epoch": 0.6343976593308739, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.43081283569336, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8701776266098022, + "num_tokens": 190308575.0, + "step": 4987 + }, + { + "epoch": 0.6345248696094644, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.19135856628418, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8686129450798035, + "num_tokens": 190344584.0, + "step": 4988 + }, + { + "epoch": 0.6346520798880549, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.281784057617188, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8647929430007935, + "num_tokens": 190376497.0, + "step": 4989 + }, + { + "epoch": 0.6347792901666455, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33302879333496, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8475453853607178, + "num_tokens": 190416879.0, + "step": 4990 + }, + { + "epoch": 0.634906500445236, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.31146240234375, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8785461783409119, + "num_tokens": 190454684.0, + "step": 4991 + }, + { + "epoch": 0.6350337107238265, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.311946868896484, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.851919949054718, + "num_tokens": 190490524.0, + "step": 4992 + }, + { + "epoch": 0.635160921002417, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.23799705505371, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8653661608695984, + "num_tokens": 190526558.0, + "step": 4993 + }, + { + "epoch": 0.6352881312810075, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.237598419189453, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8589143753051758, + "num_tokens": 190571206.0, + "step": 4994 + }, + { + "epoch": 0.635415341559598, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.484844207763672, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8475591540336609, + "num_tokens": 190611480.0, + "step": 4995 + }, + { + "epoch": 0.6355425518381885, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.20451545715332, + "learning_rate": 1e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8311128616333008, + "num_tokens": 190647699.0, + "step": 4996 + }, + { + "epoch": 0.6356697621167791, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.31572914123535, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8590264320373535, + "num_tokens": 190684863.0, + "step": 4997 + }, + { + "epoch": 0.6357969723953696, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.267108917236328, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8438704013824463, + "num_tokens": 190725006.0, + "step": 4998 + }, + { + "epoch": 0.63592418267396, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.361536026000977, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.856766939163208, + "num_tokens": 190757431.0, + "step": 4999 + }, + { + "epoch": 0.6360513929525505, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.340742111206055, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8633525371551514, + "num_tokens": 190793118.0, + "step": 5000 + }, + { + "epoch": 0.6361786032311411, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.306432723999023, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8677713871002197, + "num_tokens": 190837105.0, + "step": 5001 + }, + { + "epoch": 0.6363058135097316, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.199851989746094, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8533474802970886, + "num_tokens": 190875372.0, + "step": 5002 + }, + { + "epoch": 0.6364330237883221, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.34195327758789, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8641729354858398, + "num_tokens": 190921307.0, + "step": 5003 + }, + { + "epoch": 0.6365602340669126, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.218502044677734, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8586626648902893, + "num_tokens": 190967919.0, + "step": 5004 + }, + { + "epoch": 0.6366874443455031, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.32505226135254, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8568997383117676, + "num_tokens": 191005668.0, + "step": 5005 + }, + { + "epoch": 0.6368146546240936, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.250755310058594, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8475028872489929, + "num_tokens": 191044455.0, + "step": 5006 + }, + { + "epoch": 0.6369418649026841, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.532018661499023, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8639160394668579, + "num_tokens": 191086369.0, + "step": 5007 + }, + { + "epoch": 0.6370690751812746, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.1864070892334, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8536818027496338, + "num_tokens": 191127070.0, + "step": 5008 + }, + { + "epoch": 0.6371962854598652, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.37537384033203, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8727949261665344, + "num_tokens": 191163623.0, + "step": 5009 + }, + { + "epoch": 0.6373234957384557, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.2059326171875, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8540539145469666, + "num_tokens": 191196790.0, + "step": 5010 + }, + { + "epoch": 0.6374507060170462, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59085464477539, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8556687831878662, + "num_tokens": 191230265.0, + "step": 5011 + }, + { + "epoch": 0.6375779162956366, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.203039169311523, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8500072956085205, + "num_tokens": 191266781.0, + "step": 5012 + }, + { + "epoch": 0.6377051265742272, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.44268035888672, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8489546775817871, + "num_tokens": 191305869.0, + "step": 5013 + }, + { + "epoch": 0.6378323368528177, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.30812644958496, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8464758396148682, + "num_tokens": 191343498.0, + "step": 5014 + }, + { + "epoch": 0.6379595471314082, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.309207916259766, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8435269594192505, + "num_tokens": 191383260.0, + "step": 5015 + }, + { + "epoch": 0.6380867574099988, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.152179718017578, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8504211902618408, + "num_tokens": 191417350.0, + "step": 5016 + }, + { + "epoch": 0.6382139676885893, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.414297103881836, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8601117134094238, + "num_tokens": 191454336.0, + "step": 5017 + }, + { + "epoch": 0.6383411779671797, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.377361297607422, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.859711766242981, + "num_tokens": 191489977.0, + "step": 5018 + }, + { + "epoch": 0.6384683882457702, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.282283782958984, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8486099243164062, + "num_tokens": 191524622.0, + "step": 5019 + }, + { + "epoch": 0.6385955985243608, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.44310188293457, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8472862839698792, + "num_tokens": 191565389.0, + "step": 5020 + }, + { + "epoch": 0.6387228088029513, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.279769897460938, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.877055287361145, + "num_tokens": 191604640.0, + "step": 5021 + }, + { + "epoch": 0.6388500190815418, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.115140914916992, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8576565980911255, + "num_tokens": 191651029.0, + "step": 5022 + }, + { + "epoch": 0.6389772293601323, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.54631233215332, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8584460616111755, + "num_tokens": 191690524.0, + "step": 5023 + }, + { + "epoch": 0.6391044396387228, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.222557067871094, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8528200387954712, + "num_tokens": 191733992.0, + "step": 5024 + }, + { + "epoch": 0.6392316499173133, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.385818481445312, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8692034482955933, + "num_tokens": 191769717.0, + "step": 5025 + }, + { + "epoch": 0.6393588601959038, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.304790496826172, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.844083309173584, + "num_tokens": 191813334.0, + "step": 5026 + }, + { + "epoch": 0.6394860704744944, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.307119369506836, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8601794242858887, + "num_tokens": 191848295.0, + "step": 5027 + }, + { + "epoch": 0.6396132807530849, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48474884033203, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8508073687553406, + "num_tokens": 191885329.0, + "step": 5028 + }, + { + "epoch": 0.6397404910316754, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.126495361328125, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8710962533950806, + "num_tokens": 191918070.0, + "step": 5029 + }, + { + "epoch": 0.6398677013102658, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.558019638061523, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8623795509338379, + "num_tokens": 191966084.0, + "step": 5030 + }, + { + "epoch": 0.6399949115888564, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.253713607788086, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8575741052627563, + "num_tokens": 192004907.0, + "step": 5031 + }, + { + "epoch": 0.6401221218674469, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.334033966064453, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8415967226028442, + "num_tokens": 192042942.0, + "step": 5032 + }, + { + "epoch": 0.6402493321460374, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.280576705932617, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8584730625152588, + "num_tokens": 192081133.0, + "step": 5033 + }, + { + "epoch": 0.6403765424246279, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.328914642333984, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.859123706817627, + "num_tokens": 192120171.0, + "step": 5034 + }, + { + "epoch": 0.6405037527032185, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.28407859802246, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8593738079071045, + "num_tokens": 192159583.0, + "step": 5035 + }, + { + "epoch": 0.6406309629818089, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.407014846801758, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8564491271972656, + "num_tokens": 192199513.0, + "step": 5036 + }, + { + "epoch": 0.6407581732603994, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.27806854248047, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8607884645462036, + "num_tokens": 192228352.0, + "step": 5037 + }, + { + "epoch": 0.6408853835389899, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.3030948638916, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8537001013755798, + "num_tokens": 192264480.0, + "step": 5038 + }, + { + "epoch": 0.6410125938175805, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.278663635253906, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8565954566001892, + "num_tokens": 192303307.0, + "step": 5039 + }, + { + "epoch": 0.641139804096171, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.379179000854492, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8611140251159668, + "num_tokens": 192342064.0, + "step": 5040 + }, + { + "epoch": 0.6412670143747615, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4567928314209, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8657889366149902, + "num_tokens": 192379297.0, + "step": 5041 + }, + { + "epoch": 0.6413942246533519, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.332645416259766, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8612036108970642, + "num_tokens": 192416087.0, + "step": 5042 + }, + { + "epoch": 0.6415214349319425, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.31987190246582, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8637990951538086, + "num_tokens": 192454421.0, + "step": 5043 + }, + { + "epoch": 0.641648645210533, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.517995834350586, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8551318645477295, + "num_tokens": 192493860.0, + "step": 5044 + }, + { + "epoch": 0.6417758554891235, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.1538143157959, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8492567539215088, + "num_tokens": 192533672.0, + "step": 5045 + }, + { + "epoch": 0.641903065767714, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.466100692749023, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.846220850944519, + "num_tokens": 192569569.0, + "step": 5046 + }, + { + "epoch": 0.6420302760463046, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.305294036865234, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8643122315406799, + "num_tokens": 192610716.0, + "step": 5047 + }, + { + "epoch": 0.642157486324895, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.388519287109375, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8522391319274902, + "num_tokens": 192648040.0, + "step": 5048 + }, + { + "epoch": 0.6422846966034855, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.286027908325195, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8569899201393127, + "num_tokens": 192690063.0, + "step": 5049 + }, + { + "epoch": 0.6424119068820761, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.381044387817383, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8527121543884277, + "num_tokens": 192732067.0, + "step": 5050 + }, + { + "epoch": 0.6425391171606666, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.248506546020508, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8686931133270264, + "num_tokens": 192768598.0, + "step": 5051 + }, + { + "epoch": 0.6426663274392571, + "ewc_loss": 0.0244140625, + "ewc_loss_parallel": 2.4437904357910156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.28061866760254, + "learning_rate": 1e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8295154571533203, + "num_tokens": 192805420.0, + "step": 5052 + }, + { + "epoch": 0.6427935377178476, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.488563537597656, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8534077405929565, + "num_tokens": 192852082.0, + "step": 5053 + }, + { + "epoch": 0.6429207479964381, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33669662475586, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8587291240692139, + "num_tokens": 192892846.0, + "step": 5054 + }, + { + "epoch": 0.6430479582750286, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.47190284729004, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8762313723564148, + "num_tokens": 192932091.0, + "step": 5055 + }, + { + "epoch": 0.6431751685536191, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.2208251953125, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8599008321762085, + "num_tokens": 192960628.0, + "step": 5056 + }, + { + "epoch": 0.6433023788322096, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.44380760192871, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8519091010093689, + "num_tokens": 193002075.0, + "step": 5057 + }, + { + "epoch": 0.6434295891108002, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.247303009033203, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8479272723197937, + "num_tokens": 193039720.0, + "step": 5058 + }, + { + "epoch": 0.6435567993893907, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.364736557006836, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8580514788627625, + "num_tokens": 193072806.0, + "step": 5059 + }, + { + "epoch": 0.6436840096679812, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.396408081054688, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8669509291648865, + "num_tokens": 193108478.0, + "step": 5060 + }, + { + "epoch": 0.6438112199465716, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.209959030151367, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8614054322242737, + "num_tokens": 193150663.0, + "step": 5061 + }, + { + "epoch": 0.6439384302251622, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.223608016967773, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8612014651298523, + "num_tokens": 193191499.0, + "step": 5062 + }, + { + "epoch": 0.6440656405037527, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.235334396362305, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8491250872612, + "num_tokens": 193236103.0, + "step": 5063 + }, + { + "epoch": 0.6441928507823432, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.310192108154297, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8370878100395203, + "num_tokens": 193274095.0, + "step": 5064 + }, + { + "epoch": 0.6443200610609338, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33344841003418, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8638452291488647, + "num_tokens": 193316988.0, + "step": 5065 + }, + { + "epoch": 0.6444472713395243, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33737564086914, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.856227457523346, + "num_tokens": 193357430.0, + "step": 5066 + }, + { + "epoch": 0.6445744816181147, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.307172775268555, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.865808367729187, + "num_tokens": 193390392.0, + "step": 5067 + }, + { + "epoch": 0.6447016918967052, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.400362014770508, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8554421067237854, + "num_tokens": 193430961.0, + "step": 5068 + }, + { + "epoch": 0.6448289021752958, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.281368255615234, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8564870953559875, + "num_tokens": 193473767.0, + "step": 5069 + }, + { + "epoch": 0.6449561124538863, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.330707550048828, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8462792038917542, + "num_tokens": 193511375.0, + "step": 5070 + }, + { + "epoch": 0.6450833227324768, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.39545249938965, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8689031600952148, + "num_tokens": 193551296.0, + "step": 5071 + }, + { + "epoch": 0.6452105330110673, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26249122619629, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8622565269470215, + "num_tokens": 193596484.0, + "step": 5072 + }, + { + "epoch": 0.6453377432896578, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.418378829956055, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8644648790359497, + "num_tokens": 193637513.0, + "step": 5073 + }, + { + "epoch": 0.6454649535682483, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.392202377319336, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8523650765419006, + "num_tokens": 193672808.0, + "step": 5074 + }, + { + "epoch": 0.6455921638468388, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.203805923461914, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8730077147483826, + "num_tokens": 193713018.0, + "step": 5075 + }, + { + "epoch": 0.6457193741254293, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.306251525878906, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.860379695892334, + "num_tokens": 193757649.0, + "step": 5076 + }, + { + "epoch": 0.6458465844040199, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.38042449951172, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8551130294799805, + "num_tokens": 193792390.0, + "step": 5077 + }, + { + "epoch": 0.6459737946826104, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.231502532958984, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8656533360481262, + "num_tokens": 193827721.0, + "step": 5078 + }, + { + "epoch": 0.6461010049612008, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51895523071289, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8496549129486084, + "num_tokens": 193867217.0, + "step": 5079 + }, + { + "epoch": 0.6462282152397913, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.300294876098633, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8654727935791016, + "num_tokens": 193900128.0, + "step": 5080 + }, + { + "epoch": 0.6463554255183819, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.551300048828125, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8691838979721069, + "num_tokens": 193935744.0, + "step": 5081 + }, + { + "epoch": 0.6464826357969724, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.247142791748047, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8551270961761475, + "num_tokens": 193977344.0, + "step": 5082 + }, + { + "epoch": 0.6466098460755629, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.45035743713379, + "learning_rate": 1e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8318960666656494, + "num_tokens": 194014295.0, + "step": 5083 + }, + { + "epoch": 0.6467370563541535, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.41549301147461, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8424695134162903, + "num_tokens": 194055245.0, + "step": 5084 + }, + { + "epoch": 0.6468642666327439, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.31985855102539, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.850659966468811, + "num_tokens": 194089313.0, + "step": 5085 + }, + { + "epoch": 0.6469914769113344, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.382244110107422, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8647987246513367, + "num_tokens": 194128293.0, + "step": 5086 + }, + { + "epoch": 0.6471186871899249, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.334117889404297, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8513244390487671, + "num_tokens": 194166632.0, + "step": 5087 + }, + { + "epoch": 0.6472458974685155, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.43696403503418, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8445971012115479, + "num_tokens": 194209176.0, + "step": 5088 + }, + { + "epoch": 0.647373107747106, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.303810119628906, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8478701114654541, + "num_tokens": 194245617.0, + "step": 5089 + }, + { + "epoch": 0.6475003180256965, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.372634887695312, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8638421297073364, + "num_tokens": 194285983.0, + "step": 5090 + }, + { + "epoch": 0.6476275283042869, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.1982479095459, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8545289039611816, + "num_tokens": 194317088.0, + "step": 5091 + }, + { + "epoch": 0.6477547385828775, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.436084747314453, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8627498149871826, + "num_tokens": 194354025.0, + "step": 5092 + }, + { + "epoch": 0.647881948861468, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.233325958251953, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8616614937782288, + "num_tokens": 194388539.0, + "step": 5093 + }, + { + "epoch": 0.6480091591400585, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.40316390991211, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8595485687255859, + "num_tokens": 194424518.0, + "step": 5094 + }, + { + "epoch": 0.648136369418649, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.23583984375, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8539401292800903, + "num_tokens": 194463310.0, + "step": 5095 + }, + { + "epoch": 0.6482635796972396, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.313180923461914, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8569425344467163, + "num_tokens": 194499911.0, + "step": 5096 + }, + { + "epoch": 0.64839078997583, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.405261993408203, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8484963178634644, + "num_tokens": 194536731.0, + "step": 5097 + }, + { + "epoch": 0.6485180002544205, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.242938995361328, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8469225764274597, + "num_tokens": 194574014.0, + "step": 5098 + }, + { + "epoch": 0.648645210533011, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.391063690185547, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8497642278671265, + "num_tokens": 194611944.0, + "step": 5099 + }, + { + "epoch": 0.6487724208116016, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.396257400512695, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8682329654693604, + "num_tokens": 194649734.0, + "step": 5100 + }, + { + "epoch": 0.6488996310901921, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.314178466796875, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8655580282211304, + "num_tokens": 194688344.0, + "step": 5101 + }, + { + "epoch": 0.6490268413687826, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.31422233581543, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8450149893760681, + "num_tokens": 194728582.0, + "step": 5102 + }, + { + "epoch": 0.649154051647373, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.360536575317383, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8582873344421387, + "num_tokens": 194762753.0, + "step": 5103 + }, + { + "epoch": 0.6492812619259636, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.348419189453125, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8710231184959412, + "num_tokens": 194802325.0, + "step": 5104 + }, + { + "epoch": 0.6494084722045541, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.397783279418945, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8568049073219299, + "num_tokens": 194846349.0, + "step": 5105 + }, + { + "epoch": 0.6495356824831446, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.246793746948242, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8587260246276855, + "num_tokens": 194880334.0, + "step": 5106 + }, + { + "epoch": 0.6496628927617352, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.52637481689453, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8487460613250732, + "num_tokens": 194919343.0, + "step": 5107 + }, + { + "epoch": 0.6497901030403257, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.38519287109375, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8589885234832764, + "num_tokens": 194958800.0, + "step": 5108 + }, + { + "epoch": 0.6499173133189162, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55414581298828, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8312116861343384, + "num_tokens": 194997249.0, + "step": 5109 + }, + { + "epoch": 0.6500445235975066, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.396644592285156, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8599998354911804, + "num_tokens": 195035879.0, + "step": 5110 + }, + { + "epoch": 0.6501717338760972, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.44324493408203, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8527079820632935, + "num_tokens": 195075925.0, + "step": 5111 + }, + { + "epoch": 0.6502989441546877, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.47683334350586, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8601975440979004, + "num_tokens": 195110185.0, + "step": 5112 + }, + { + "epoch": 0.6504261544332782, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.559162139892578, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8510600328445435, + "num_tokens": 195154666.0, + "step": 5113 + }, + { + "epoch": 0.6505533647118688, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.366674423217773, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8701121807098389, + "num_tokens": 195188998.0, + "step": 5114 + }, + { + "epoch": 0.6506805749904593, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4007511138916, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8654916286468506, + "num_tokens": 195224124.0, + "step": 5115 + }, + { + "epoch": 0.6508077852690497, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.46843910217285, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8588811159133911, + "num_tokens": 195253601.0, + "step": 5116 + }, + { + "epoch": 0.6509349955476402, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.312318801879883, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8498359322547913, + "num_tokens": 195289045.0, + "step": 5117 + }, + { + "epoch": 0.6510622058262308, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.492910385131836, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8586888909339905, + "num_tokens": 195326983.0, + "step": 5118 + }, + { + "epoch": 0.6511894161048213, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.408550262451172, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8704676628112793, + "num_tokens": 195360425.0, + "step": 5119 + }, + { + "epoch": 0.6513166263834118, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.356584548950195, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8475009202957153, + "num_tokens": 195402648.0, + "step": 5120 + }, + { + "epoch": 0.6514438366620023, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.37721061706543, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8761625289916992, + "num_tokens": 195440945.0, + "step": 5121 + }, + { + "epoch": 0.6515710469405928, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.421354293823242, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8600515723228455, + "num_tokens": 195481872.0, + "step": 5122 + }, + { + "epoch": 0.6516982572191833, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.598955154418945, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8631188273429871, + "num_tokens": 195525460.0, + "step": 5123 + }, + { + "epoch": 0.6518254674977738, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.354625701904297, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8608934879302979, + "num_tokens": 195563522.0, + "step": 5124 + }, + { + "epoch": 0.6519526777763643, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.517892837524414, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8675885200500488, + "num_tokens": 195598114.0, + "step": 5125 + }, + { + "epoch": 0.6520798880549549, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.37718963623047, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.853093147277832, + "num_tokens": 195639826.0, + "step": 5126 + }, + { + "epoch": 0.6522070983335454, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.563018798828125, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.864683210849762, + "num_tokens": 195682047.0, + "step": 5127 + }, + { + "epoch": 0.6523343086121358, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.342823028564453, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8622738718986511, + "num_tokens": 195722453.0, + "step": 5128 + }, + { + "epoch": 0.6524615188907263, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.36318016052246, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.854104220867157, + "num_tokens": 195759182.0, + "step": 5129 + }, + { + "epoch": 0.6525887291693169, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.338680267333984, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8620772361755371, + "num_tokens": 195795641.0, + "step": 5130 + }, + { + "epoch": 0.6527159394479074, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.506324768066406, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8678109049797058, + "num_tokens": 195831200.0, + "step": 5131 + }, + { + "epoch": 0.6528431497264979, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.27205467224121, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8608867526054382, + "num_tokens": 195875110.0, + "step": 5132 + }, + { + "epoch": 0.6529703600050885, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.46634292602539, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8609404563903809, + "num_tokens": 195916724.0, + "step": 5133 + }, + { + "epoch": 0.6530975702836789, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.645206451416016, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.851941704750061, + "num_tokens": 195956219.0, + "step": 5134 + }, + { + "epoch": 0.6532247805622694, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.22638702392578, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8620474338531494, + "num_tokens": 195991563.0, + "step": 5135 + }, + { + "epoch": 0.6533519908408599, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.35976219177246, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8651986122131348, + "num_tokens": 196031864.0, + "step": 5136 + }, + { + "epoch": 0.6534792011194505, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51563835144043, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8514618873596191, + "num_tokens": 196067208.0, + "step": 5137 + }, + { + "epoch": 0.653606411398041, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.235185623168945, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8625198602676392, + "num_tokens": 196103934.0, + "step": 5138 + }, + { + "epoch": 0.6537336216766315, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.346229553222656, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8530006408691406, + "num_tokens": 196140886.0, + "step": 5139 + }, + { + "epoch": 0.6538608319552219, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.462051391601562, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8671607971191406, + "num_tokens": 196178940.0, + "step": 5140 + }, + { + "epoch": 0.6539880422338125, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.122106552124023, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8605095148086548, + "num_tokens": 196229461.0, + "step": 5141 + }, + { + "epoch": 0.654115252512403, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.539222717285156, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8723133206367493, + "num_tokens": 196265793.0, + "step": 5142 + }, + { + "epoch": 0.6542424627909935, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.29541778564453, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8650810718536377, + "num_tokens": 196304919.0, + "step": 5143 + }, + { + "epoch": 0.654369673069584, + "ewc_loss": 0.0245361328125, + "ewc_loss_parallel": 2.4557113647460938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.46488380432129, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8737082481384277, + "num_tokens": 196344761.0, + "step": 5144 + }, + { + "epoch": 0.6544968833481746, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.47847557067871, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.873031735420227, + "num_tokens": 196382625.0, + "step": 5145 + }, + { + "epoch": 0.654624093626765, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.521467208862305, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8702808618545532, + "num_tokens": 196422279.0, + "step": 5146 + }, + { + "epoch": 0.6547513039053555, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33946990966797, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8549719452857971, + "num_tokens": 196463684.0, + "step": 5147 + }, + { + "epoch": 0.654878514183946, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.382999420166016, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8616893291473389, + "num_tokens": 196506319.0, + "step": 5148 + }, + { + "epoch": 0.6550057244625366, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33462905883789, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8528415560722351, + "num_tokens": 196546367.0, + "step": 5149 + }, + { + "epoch": 0.6551329347411271, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.265344619750977, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8440197706222534, + "num_tokens": 196576598.0, + "step": 5150 + }, + { + "epoch": 0.6552601450197176, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.509109497070312, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8531379103660583, + "num_tokens": 196610633.0, + "step": 5151 + }, + { + "epoch": 0.655387355298308, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.233243942260742, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8506776690483093, + "num_tokens": 196646183.0, + "step": 5152 + }, + { + "epoch": 0.6555145655768986, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.329395294189453, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8681751489639282, + "num_tokens": 196689454.0, + "step": 5153 + }, + { + "epoch": 0.6556417758554891, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.348825454711914, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8432170152664185, + "num_tokens": 196731820.0, + "step": 5154 + }, + { + "epoch": 0.6557689861340796, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.42500114440918, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8494375944137573, + "num_tokens": 196768542.0, + "step": 5155 + }, + { + "epoch": 0.6558961964126702, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48253631591797, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8438669443130493, + "num_tokens": 196810300.0, + "step": 5156 + }, + { + "epoch": 0.6560234066912607, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.329071044921875, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.842616081237793, + "num_tokens": 196847271.0, + "step": 5157 + }, + { + "epoch": 0.6561506169698512, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.781578063964844, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8738247156143188, + "num_tokens": 196880439.0, + "step": 5158 + }, + { + "epoch": 0.6562778272484416, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4271240234375, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8519086837768555, + "num_tokens": 196914911.0, + "step": 5159 + }, + { + "epoch": 0.6564050375270322, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.45361328125, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8595172762870789, + "num_tokens": 196949419.0, + "step": 5160 + }, + { + "epoch": 0.6565322478056227, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.439176559448242, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8509204387664795, + "num_tokens": 196989401.0, + "step": 5161 + }, + { + "epoch": 0.6566594580842132, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.632633209228516, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.850871205329895, + "num_tokens": 197028377.0, + "step": 5162 + }, + { + "epoch": 0.6567866683628037, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.34510612487793, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8380993604660034, + "num_tokens": 197068334.0, + "step": 5163 + }, + { + "epoch": 0.6569138786413943, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.478239059448242, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.860978364944458, + "num_tokens": 197105978.0, + "step": 5164 + }, + { + "epoch": 0.6570410889199847, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.454675674438477, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8415173292160034, + "num_tokens": 197148641.0, + "step": 5165 + }, + { + "epoch": 0.6571682991985752, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.25777244567871, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8392991423606873, + "num_tokens": 197188820.0, + "step": 5166 + }, + { + "epoch": 0.6572955094771658, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49042320251465, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8456175327301025, + "num_tokens": 197227109.0, + "step": 5167 + }, + { + "epoch": 0.6574227197557563, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.334897994995117, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8651843070983887, + "num_tokens": 197264083.0, + "step": 5168 + }, + { + "epoch": 0.6575499300343468, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.30080795288086, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8541019558906555, + "num_tokens": 197301349.0, + "step": 5169 + }, + { + "epoch": 0.6576771403129373, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.40656852722168, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8476405739784241, + "num_tokens": 197348176.0, + "step": 5170 + }, + { + "epoch": 0.6578043505915278, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.446870803833008, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8499540090560913, + "num_tokens": 197380490.0, + "step": 5171 + }, + { + "epoch": 0.6579315608701183, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.391834259033203, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8453503847122192, + "num_tokens": 197423075.0, + "step": 5172 + }, + { + "epoch": 0.6580587711487088, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.464427947998047, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8629224300384521, + "num_tokens": 197461730.0, + "step": 5173 + }, + { + "epoch": 0.6581859814272993, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.371456146240234, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8468550443649292, + "num_tokens": 197505801.0, + "step": 5174 + }, + { + "epoch": 0.6583131917058899, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.520856857299805, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8753713369369507, + "num_tokens": 197534643.0, + "step": 5175 + }, + { + "epoch": 0.6584404019844804, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.252321243286133, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8636236786842346, + "num_tokens": 197576490.0, + "step": 5176 + }, + { + "epoch": 0.6585676122630708, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.036624908447266, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8711819648742676, + "num_tokens": 197612745.0, + "step": 5177 + }, + { + "epoch": 0.6586948225416613, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.380990982055664, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8533097505569458, + "num_tokens": 197653756.0, + "step": 5178 + }, + { + "epoch": 0.6588220328202519, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.243864059448242, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8827776312828064, + "num_tokens": 197694417.0, + "step": 5179 + }, + { + "epoch": 0.6589492430988424, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.474416732788086, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8532835245132446, + "num_tokens": 197732738.0, + "step": 5180 + }, + { + "epoch": 0.6590764533774329, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.443378448486328, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8496096134185791, + "num_tokens": 197759408.0, + "step": 5181 + }, + { + "epoch": 0.6592036636560235, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.393112182617188, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8602548837661743, + "num_tokens": 197794334.0, + "step": 5182 + }, + { + "epoch": 0.6593308739346139, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.32014274597168, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8673602342605591, + "num_tokens": 197832497.0, + "step": 5183 + }, + { + "epoch": 0.6594580842132044, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.585308074951172, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8498578071594238, + "num_tokens": 197869021.0, + "step": 5184 + }, + { + "epoch": 0.6595852944917949, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.449934005737305, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.844512939453125, + "num_tokens": 197906365.0, + "step": 5185 + }, + { + "epoch": 0.6597125047703855, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.423025131225586, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8614513874053955, + "num_tokens": 197950725.0, + "step": 5186 + }, + { + "epoch": 0.659839715048976, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.40033721923828, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8664910793304443, + "num_tokens": 197991178.0, + "step": 5187 + }, + { + "epoch": 0.6599669253275665, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5674991607666, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8495624661445618, + "num_tokens": 198030176.0, + "step": 5188 + }, + { + "epoch": 0.6600941356061569, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4217586517334, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8524253368377686, + "num_tokens": 198067169.0, + "step": 5189 + }, + { + "epoch": 0.6602213458847475, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.481264114379883, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8475406765937805, + "num_tokens": 198104994.0, + "step": 5190 + }, + { + "epoch": 0.660348556163338, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.40628433227539, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8625981211662292, + "num_tokens": 198142791.0, + "step": 5191 + }, + { + "epoch": 0.6604757664419285, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49340057373047, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8545874357223511, + "num_tokens": 198179300.0, + "step": 5192 + }, + { + "epoch": 0.660602976720519, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.529644012451172, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8726921081542969, + "num_tokens": 198220230.0, + "step": 5193 + }, + { + "epoch": 0.6607301869991096, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.613384246826172, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8476791977882385, + "num_tokens": 198251067.0, + "step": 5194 + }, + { + "epoch": 0.6608573972777, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.17670440673828, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8670467138290405, + "num_tokens": 198282130.0, + "step": 5195 + }, + { + "epoch": 0.6609846075562905, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.409568786621094, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8689068555831909, + "num_tokens": 198325609.0, + "step": 5196 + }, + { + "epoch": 0.661111817834881, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.348344802856445, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8606033325195312, + "num_tokens": 198365116.0, + "step": 5197 + }, + { + "epoch": 0.6612390281134716, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.52961540222168, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8584806323051453, + "num_tokens": 198403467.0, + "step": 5198 + }, + { + "epoch": 0.6613662383920621, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.459407806396484, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8640937209129333, + "num_tokens": 198440676.0, + "step": 5199 + }, + { + "epoch": 0.6614934486706526, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.3649959564209, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8609774112701416, + "num_tokens": 198479082.0, + "step": 5200 + }, + { + "epoch": 0.661620658949243, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.164438247680664, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8546701669692993, + "num_tokens": 198521741.0, + "step": 5201 + }, + { + "epoch": 0.6617478692278336, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4113826751709, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8514459133148193, + "num_tokens": 198562177.0, + "step": 5202 + }, + { + "epoch": 0.6618750795064241, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.43836784362793, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8590980768203735, + "num_tokens": 198600726.0, + "step": 5203 + }, + { + "epoch": 0.6620022897850146, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.365814208984375, + "learning_rate": 1e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8292657732963562, + "num_tokens": 198638156.0, + "step": 5204 + }, + { + "epoch": 0.6621295000636052, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.22855567932129, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.839910626411438, + "num_tokens": 198680312.0, + "step": 5205 + }, + { + "epoch": 0.6622567103421957, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.563129425048828, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8762301206588745, + "num_tokens": 198723886.0, + "step": 5206 + }, + { + "epoch": 0.6623839206207861, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.41889190673828, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.858838677406311, + "num_tokens": 198765331.0, + "step": 5207 + }, + { + "epoch": 0.6625111308993766, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.380189895629883, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8618923425674438, + "num_tokens": 198802421.0, + "step": 5208 + }, + { + "epoch": 0.6626383411779672, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.53416633605957, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8685469627380371, + "num_tokens": 198843335.0, + "step": 5209 + }, + { + "epoch": 0.6627655514565577, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.300086975097656, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8652453422546387, + "num_tokens": 198880925.0, + "step": 5210 + }, + { + "epoch": 0.6628927617351482, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.599449157714844, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8580777049064636, + "num_tokens": 198915448.0, + "step": 5211 + }, + { + "epoch": 0.6630199720137387, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.36501693725586, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8688064813613892, + "num_tokens": 198952254.0, + "step": 5212 + }, + { + "epoch": 0.6631471822923293, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.56962776184082, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8634802103042603, + "num_tokens": 198990841.0, + "step": 5213 + }, + { + "epoch": 0.6632743925709197, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.513704299926758, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8616454601287842, + "num_tokens": 199029383.0, + "step": 5214 + }, + { + "epoch": 0.6634016028495102, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.578096389770508, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8588740825653076, + "num_tokens": 199066731.0, + "step": 5215 + }, + { + "epoch": 0.6635288131281007, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.54877281188965, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8706391453742981, + "num_tokens": 199108752.0, + "step": 5216 + }, + { + "epoch": 0.6636560234066913, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77461051940918, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8626335859298706, + "num_tokens": 199146438.0, + "step": 5217 + }, + { + "epoch": 0.6637832336852818, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.50086212158203, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8612560629844666, + "num_tokens": 199186367.0, + "step": 5218 + }, + { + "epoch": 0.6639104439638723, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.964950561523438, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8727052807807922, + "num_tokens": 199225785.0, + "step": 5219 + }, + { + "epoch": 0.6640376542424627, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.570524215698242, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8624505400657654, + "num_tokens": 199262574.0, + "step": 5220 + }, + { + "epoch": 0.6641648645210533, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49298858642578, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8642722368240356, + "num_tokens": 199299093.0, + "step": 5221 + }, + { + "epoch": 0.6642920747996438, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.87145233154297, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8647205829620361, + "num_tokens": 199337446.0, + "step": 5222 + }, + { + "epoch": 0.6644192850782343, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.62635612487793, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8610079288482666, + "num_tokens": 199377321.0, + "step": 5223 + }, + { + "epoch": 0.6645464953568249, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.425334930419922, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8694557547569275, + "num_tokens": 199413057.0, + "step": 5224 + }, + { + "epoch": 0.6646737056354154, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.614450454711914, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8663114309310913, + "num_tokens": 199452730.0, + "step": 5225 + }, + { + "epoch": 0.6648009159140058, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.412174224853516, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8697598576545715, + "num_tokens": 199487860.0, + "step": 5226 + }, + { + "epoch": 0.6649281261925963, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.334928512573242, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8657869100570679, + "num_tokens": 199532790.0, + "step": 5227 + }, + { + "epoch": 0.6650553364711869, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.378387451171875, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8789557218551636, + "num_tokens": 199561072.0, + "step": 5228 + }, + { + "epoch": 0.6651825467497774, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.472543716430664, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8653830885887146, + "num_tokens": 199595820.0, + "step": 5229 + }, + { + "epoch": 0.6653097570283679, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.361719131469727, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8464634418487549, + "num_tokens": 199640101.0, + "step": 5230 + }, + { + "epoch": 0.6654369673069584, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.580698013305664, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8555882573127747, + "num_tokens": 199680287.0, + "step": 5231 + }, + { + "epoch": 0.6655641775855489, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.595977783203125, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.858962893486023, + "num_tokens": 199721958.0, + "step": 5232 + }, + { + "epoch": 0.6656913878641394, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.338611602783203, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8653953075408936, + "num_tokens": 199756021.0, + "step": 5233 + }, + { + "epoch": 0.6658185981427299, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.532207489013672, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8636599779129028, + "num_tokens": 199793586.0, + "step": 5234 + }, + { + "epoch": 0.6659458084213205, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.513418197631836, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8463506698608398, + "num_tokens": 199835728.0, + "step": 5235 + }, + { + "epoch": 0.666073018699911, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.32973861694336, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8685632944107056, + "num_tokens": 199873348.0, + "step": 5236 + }, + { + "epoch": 0.6662002289785015, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55747413635254, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8593828678131104, + "num_tokens": 199908223.0, + "step": 5237 + }, + { + "epoch": 0.6663274392570919, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.457956314086914, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.855657160282135, + "num_tokens": 199947017.0, + "step": 5238 + }, + { + "epoch": 0.6664546495356825, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.548625946044922, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.867559015750885, + "num_tokens": 199984806.0, + "step": 5239 + }, + { + "epoch": 0.666581859814273, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.479305267333984, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8536876440048218, + "num_tokens": 200022600.0, + "step": 5240 + }, + { + "epoch": 0.6667090700928635, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.431852340698242, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.869942307472229, + "num_tokens": 200058407.0, + "step": 5241 + }, + { + "epoch": 0.666836280371454, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.445308685302734, + "learning_rate": 1e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8329372406005859, + "num_tokens": 200101871.0, + "step": 5242 + }, + { + "epoch": 0.6669634906500446, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.56867790222168, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8603261113166809, + "num_tokens": 200138501.0, + "step": 5243 + }, + { + "epoch": 0.667090700928635, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.82499122619629, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.858237624168396, + "num_tokens": 200176639.0, + "step": 5244 + }, + { + "epoch": 0.6672179112072255, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5460262298584, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8553596138954163, + "num_tokens": 200211873.0, + "step": 5245 + }, + { + "epoch": 0.667345121485816, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.122268676757812, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8553339242935181, + "num_tokens": 200246596.0, + "step": 5246 + }, + { + "epoch": 0.6674723317644066, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.324920654296875, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8748760223388672, + "num_tokens": 200282761.0, + "step": 5247 + }, + { + "epoch": 0.6675995420429971, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.684749603271484, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8441270589828491, + "num_tokens": 200321821.0, + "step": 5248 + }, + { + "epoch": 0.6677267523215876, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.652170181274414, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8428105115890503, + "num_tokens": 200369066.0, + "step": 5249 + }, + { + "epoch": 0.667853962600178, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.28325843811035, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8513774871826172, + "num_tokens": 200408067.0, + "step": 5250 + }, + { + "epoch": 0.6679811728787686, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.865196228027344, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8767184019088745, + "num_tokens": 200446693.0, + "step": 5251 + }, + { + "epoch": 0.6681083831573591, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.560155868530273, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8404232263565063, + "num_tokens": 200485340.0, + "step": 5252 + }, + { + "epoch": 0.6682355934359496, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.434274673461914, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.856099545955658, + "num_tokens": 200521978.0, + "step": 5253 + }, + { + "epoch": 0.6683628037145402, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.57126808166504, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.860609769821167, + "num_tokens": 200560604.0, + "step": 5254 + }, + { + "epoch": 0.6684900139931307, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.719694137573242, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8556034564971924, + "num_tokens": 200601718.0, + "step": 5255 + }, + { + "epoch": 0.6686172242717211, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.44786262512207, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8348694443702698, + "num_tokens": 200646186.0, + "step": 5256 + }, + { + "epoch": 0.6687444345503116, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.474031448364258, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8617881536483765, + "num_tokens": 200684849.0, + "step": 5257 + }, + { + "epoch": 0.6688716448289022, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.705787658691406, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8646823763847351, + "num_tokens": 200719542.0, + "step": 5258 + }, + { + "epoch": 0.6689988551074927, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.65389633178711, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8599667549133301, + "num_tokens": 200756212.0, + "step": 5259 + }, + { + "epoch": 0.6691260653860832, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.512874603271484, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8619688749313354, + "num_tokens": 200791184.0, + "step": 5260 + }, + { + "epoch": 0.6692532756646737, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.61037254333496, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8432580232620239, + "num_tokens": 200823966.0, + "step": 5261 + }, + { + "epoch": 0.6693804859432643, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.541051864624023, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8642834424972534, + "num_tokens": 200862718.0, + "step": 5262 + }, + { + "epoch": 0.6695076962218547, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.456701278686523, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8545529842376709, + "num_tokens": 200901755.0, + "step": 5263 + }, + { + "epoch": 0.6696349065004452, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.534778594970703, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8707927465438843, + "num_tokens": 200935788.0, + "step": 5264 + }, + { + "epoch": 0.6697621167790357, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.58049774169922, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8729139566421509, + "num_tokens": 200975159.0, + "step": 5265 + }, + { + "epoch": 0.6698893270576263, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.405065536499023, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8713672161102295, + "num_tokens": 201010813.0, + "step": 5266 + }, + { + "epoch": 0.6700165373362168, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.743728637695312, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8572957515716553, + "num_tokens": 201051937.0, + "step": 5267 + }, + { + "epoch": 0.6701437476148073, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.377971649169922, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8654572367668152, + "num_tokens": 201090926.0, + "step": 5268 + }, + { + "epoch": 0.6702709578933977, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.583057403564453, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8390078544616699, + "num_tokens": 201126086.0, + "step": 5269 + }, + { + "epoch": 0.6703981681719883, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.26062774658203, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8490873575210571, + "num_tokens": 201165933.0, + "step": 5270 + }, + { + "epoch": 0.6705253784505788, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.365203857421875, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8619486093521118, + "num_tokens": 201198794.0, + "step": 5271 + }, + { + "epoch": 0.6706525887291693, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.507831573486328, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8505369424819946, + "num_tokens": 201234081.0, + "step": 5272 + }, + { + "epoch": 0.6707797990077599, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.255077362060547, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8553745746612549, + "num_tokens": 201271796.0, + "step": 5273 + }, + { + "epoch": 0.6709070092863504, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.431779861450195, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8411006927490234, + "num_tokens": 201309526.0, + "step": 5274 + }, + { + "epoch": 0.6710342195649408, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.401229858398438, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8664993047714233, + "num_tokens": 201344015.0, + "step": 5275 + }, + { + "epoch": 0.6711614298435313, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.467336654663086, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8601406812667847, + "num_tokens": 201385201.0, + "step": 5276 + }, + { + "epoch": 0.6712886401221219, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.327470779418945, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8646523356437683, + "num_tokens": 201422370.0, + "step": 5277 + }, + { + "epoch": 0.6714158504007124, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.603866577148438, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.844514012336731, + "num_tokens": 201467085.0, + "step": 5278 + }, + { + "epoch": 0.6715430606793029, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.564672470092773, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8737037181854248, + "num_tokens": 201503432.0, + "step": 5279 + }, + { + "epoch": 0.6716702709578934, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.608938217163086, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8659012913703918, + "num_tokens": 201541358.0, + "step": 5280 + }, + { + "epoch": 0.6717974812364839, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.465755462646484, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8562523126602173, + "num_tokens": 201580235.0, + "step": 5281 + }, + { + "epoch": 0.6719246915150744, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.872802734375, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8600807189941406, + "num_tokens": 201627703.0, + "step": 5282 + }, + { + "epoch": 0.6720519017936649, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.43580436706543, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8615995645523071, + "num_tokens": 201667182.0, + "step": 5283 + }, + { + "epoch": 0.6721791120722554, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.458938598632812, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8606282472610474, + "num_tokens": 201699876.0, + "step": 5284 + }, + { + "epoch": 0.672306322350846, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.536911010742188, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8491083383560181, + "num_tokens": 201732770.0, + "step": 5285 + }, + { + "epoch": 0.6724335326294365, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.54913330078125, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8612428903579712, + "num_tokens": 201771388.0, + "step": 5286 + }, + { + "epoch": 0.6725607429080269, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.648311614990234, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8535610437393188, + "num_tokens": 201809283.0, + "step": 5287 + }, + { + "epoch": 0.6726879531866174, + "ewc_loss": 0.024658203125, + "ewc_loss_parallel": 2.467632293701172e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.531728744506836, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8651280403137207, + "num_tokens": 201854950.0, + "step": 5288 + }, + { + "epoch": 0.672815163465208, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.575851440429688, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8630980253219604, + "num_tokens": 201895638.0, + "step": 5289 + }, + { + "epoch": 0.6729423737437985, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49209976196289, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8465415239334106, + "num_tokens": 201928243.0, + "step": 5290 + }, + { + "epoch": 0.673069584022389, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6063289642334, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8580161333084106, + "num_tokens": 201964687.0, + "step": 5291 + }, + { + "epoch": 0.6731967943009796, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.33125877380371, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8717309236526489, + "num_tokens": 201997166.0, + "step": 5292 + }, + { + "epoch": 0.67332400457957, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.596586227416992, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8621461987495422, + "num_tokens": 202030771.0, + "step": 5293 + }, + { + "epoch": 0.6734512148581605, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.45862579345703, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.851453423500061, + "num_tokens": 202077280.0, + "step": 5294 + }, + { + "epoch": 0.673578425136751, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.472429275512695, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8515303730964661, + "num_tokens": 202115273.0, + "step": 5295 + }, + { + "epoch": 0.6737056354153416, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.692642211914062, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8698461055755615, + "num_tokens": 202145347.0, + "step": 5296 + }, + { + "epoch": 0.6738328456939321, + "ewc_loss": 0.0247802734375, + "ewc_loss_parallel": 2.47955322265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.482778549194336, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8642014265060425, + "num_tokens": 202184621.0, + "step": 5297 + }, + { + "epoch": 0.6739600559725226, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90960693359375, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8621965646743774, + "num_tokens": 202230099.0, + "step": 5298 + }, + { + "epoch": 0.674087266251113, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.385196685791016, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8524367809295654, + "num_tokens": 202267872.0, + "step": 5299 + }, + { + "epoch": 0.6742144765297036, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.65826416015625, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8577785491943359, + "num_tokens": 202305517.0, + "step": 5300 + }, + { + "epoch": 0.6743416868082941, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.808082580566406, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8452915549278259, + "num_tokens": 202340671.0, + "step": 5301 + }, + { + "epoch": 0.6744688970868846, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.286144256591797, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8671454191207886, + "num_tokens": 202379552.0, + "step": 5302 + }, + { + "epoch": 0.6745961073654752, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.67013168334961, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8539602756500244, + "num_tokens": 202421404.0, + "step": 5303 + }, + { + "epoch": 0.6747233176440657, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.450651168823242, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8452476263046265, + "num_tokens": 202463765.0, + "step": 5304 + }, + { + "epoch": 0.6748505279226561, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.376020431518555, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8546313047409058, + "num_tokens": 202505872.0, + "step": 5305 + }, + { + "epoch": 0.6749777382012466, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.671777725219727, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8573065996170044, + "num_tokens": 202544085.0, + "step": 5306 + }, + { + "epoch": 0.6751049484798372, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.36931800842285, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8555864691734314, + "num_tokens": 202588237.0, + "step": 5307 + }, + { + "epoch": 0.6752321587584277, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.607196807861328, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8531116247177124, + "num_tokens": 202628462.0, + "step": 5308 + }, + { + "epoch": 0.6753593690370182, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.382659912109375, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8608343005180359, + "num_tokens": 202668149.0, + "step": 5309 + }, + { + "epoch": 0.6754865793156087, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.633567810058594, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8741471171379089, + "num_tokens": 202708651.0, + "step": 5310 + }, + { + "epoch": 0.6756137895941993, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.66452980041504, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8634887933731079, + "num_tokens": 202738613.0, + "step": 5311 + }, + { + "epoch": 0.6757409998727897, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.326927185058594, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8553298711776733, + "num_tokens": 202779279.0, + "step": 5312 + }, + { + "epoch": 0.6758682101513802, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.54570960998535, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8592398166656494, + "num_tokens": 202812095.0, + "step": 5313 + }, + { + "epoch": 0.6759954204299707, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.553852081298828, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8545504808425903, + "num_tokens": 202855705.0, + "step": 5314 + }, + { + "epoch": 0.6761226307085613, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.298845291137695, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8419208526611328, + "num_tokens": 202894180.0, + "step": 5315 + }, + { + "epoch": 0.6762498409871518, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.69062042236328, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8649137020111084, + "num_tokens": 202925842.0, + "step": 5316 + }, + { + "epoch": 0.6763770512657423, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.359582901000977, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8653523325920105, + "num_tokens": 202959103.0, + "step": 5317 + }, + { + "epoch": 0.6765042615443327, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.47310447692871, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.858925461769104, + "num_tokens": 203000393.0, + "step": 5318 + }, + { + "epoch": 0.6766314718229233, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.424814224243164, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8605160117149353, + "num_tokens": 203041613.0, + "step": 5319 + }, + { + "epoch": 0.6767586821015138, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.434791564941406, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8740172982215881, + "num_tokens": 203073744.0, + "step": 5320 + }, + { + "epoch": 0.6768858923801043, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.407974243164062, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8434180021286011, + "num_tokens": 203108350.0, + "step": 5321 + }, + { + "epoch": 0.6770131026586949, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.561784744262695, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8624427318572998, + "num_tokens": 203139149.0, + "step": 5322 + }, + { + "epoch": 0.6771403129372854, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.498123168945312, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8421905636787415, + "num_tokens": 203173953.0, + "step": 5323 + }, + { + "epoch": 0.6772675232158758, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.419225692749023, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8688395023345947, + "num_tokens": 203208915.0, + "step": 5324 + }, + { + "epoch": 0.6773947334944663, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49044418334961, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8469557762145996, + "num_tokens": 203251906.0, + "step": 5325 + }, + { + "epoch": 0.6775219437730569, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.661500930786133, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8595161437988281, + "num_tokens": 203289460.0, + "step": 5326 + }, + { + "epoch": 0.6776491540516474, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51250648498535, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8668351173400879, + "num_tokens": 203325647.0, + "step": 5327 + }, + { + "epoch": 0.6777763643302379, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5154972076416, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8549315929412842, + "num_tokens": 203364972.0, + "step": 5328 + }, + { + "epoch": 0.6779035746088284, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.471187591552734, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8587616682052612, + "num_tokens": 203407778.0, + "step": 5329 + }, + { + "epoch": 0.6780307848874189, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.36145782470703, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8598818778991699, + "num_tokens": 203443820.0, + "step": 5330 + }, + { + "epoch": 0.6781579951660094, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.423309326171875, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8648061752319336, + "num_tokens": 203475742.0, + "step": 5331 + }, + { + "epoch": 0.6782852054445999, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.52734375, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.83323073387146, + "num_tokens": 203512617.0, + "step": 5332 + }, + { + "epoch": 0.6784124157231904, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.31043815612793, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8436768651008606, + "num_tokens": 203546778.0, + "step": 5333 + }, + { + "epoch": 0.678539626001781, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.443838119506836, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8603938221931458, + "num_tokens": 203585222.0, + "step": 5334 + }, + { + "epoch": 0.6786668362803715, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.447298049926758, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8528442978858948, + "num_tokens": 203627014.0, + "step": 5335 + }, + { + "epoch": 0.6787940465589619, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.400402069091797, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8654933571815491, + "num_tokens": 203659189.0, + "step": 5336 + }, + { + "epoch": 0.6789212568375524, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.510412216186523, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.869488000869751, + "num_tokens": 203695355.0, + "step": 5337 + }, + { + "epoch": 0.679048467116143, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.434356689453125, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8669682741165161, + "num_tokens": 203733446.0, + "step": 5338 + }, + { + "epoch": 0.6791756773947335, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.455705642700195, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8432244658470154, + "num_tokens": 203775387.0, + "step": 5339 + }, + { + "epoch": 0.679302887673324, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.432533264160156, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8543431758880615, + "num_tokens": 203810713.0, + "step": 5340 + }, + { + "epoch": 0.6794300979519146, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.47257423400879, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8711246848106384, + "num_tokens": 203847144.0, + "step": 5341 + }, + { + "epoch": 0.679557308230505, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.438520431518555, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8645315170288086, + "num_tokens": 203889716.0, + "step": 5342 + }, + { + "epoch": 0.6796845185090955, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.41187858581543, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8448299169540405, + "num_tokens": 203927692.0, + "step": 5343 + }, + { + "epoch": 0.679811728787686, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.504240036010742, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8460104465484619, + "num_tokens": 203960180.0, + "step": 5344 + }, + { + "epoch": 0.6799389390662766, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.609132766723633, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8704211115837097, + "num_tokens": 203997375.0, + "step": 5345 + }, + { + "epoch": 0.6800661493448671, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4603271484375, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8550173044204712, + "num_tokens": 204037588.0, + "step": 5346 + }, + { + "epoch": 0.6801933596234576, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49159049987793, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8514643907546997, + "num_tokens": 204080993.0, + "step": 5347 + }, + { + "epoch": 0.680320569902048, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.708599090576172, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8611118197441101, + "num_tokens": 204116863.0, + "step": 5348 + }, + { + "epoch": 0.6804477801806386, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.58187484741211, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.852901816368103, + "num_tokens": 204156551.0, + "step": 5349 + }, + { + "epoch": 0.6805749904592291, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.518905639648438, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8525192737579346, + "num_tokens": 204196210.0, + "step": 5350 + }, + { + "epoch": 0.6807022007378196, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55660629272461, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8572418689727783, + "num_tokens": 204234791.0, + "step": 5351 + }, + { + "epoch": 0.6808294110164101, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.64864158630371, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8616456985473633, + "num_tokens": 204271631.0, + "step": 5352 + }, + { + "epoch": 0.6809566212950007, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.405731201171875, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8694037795066833, + "num_tokens": 204311879.0, + "step": 5353 + }, + { + "epoch": 0.6810838315735911, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.522659301757812, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8622673749923706, + "num_tokens": 204357123.0, + "step": 5354 + }, + { + "epoch": 0.6812110418521816, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.494401931762695, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8573817014694214, + "num_tokens": 204397196.0, + "step": 5355 + }, + { + "epoch": 0.6813382521307721, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51112174987793, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.848859965801239, + "num_tokens": 204434654.0, + "step": 5356 + }, + { + "epoch": 0.6814654624093627, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.391056060791016, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.881393551826477, + "num_tokens": 204466581.0, + "step": 5357 + }, + { + "epoch": 0.6815926726879532, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.568384170532227, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.871419370174408, + "num_tokens": 204503890.0, + "step": 5358 + }, + { + "epoch": 0.6817198829665437, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51369857788086, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.855054497718811, + "num_tokens": 204537160.0, + "step": 5359 + }, + { + "epoch": 0.6818470932451343, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51264190673828, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8695096969604492, + "num_tokens": 204570077.0, + "step": 5360 + }, + { + "epoch": 0.6819743035237247, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55474853515625, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8745018243789673, + "num_tokens": 204603638.0, + "step": 5361 + }, + { + "epoch": 0.6821015138023152, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.37645721435547, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8554812669754028, + "num_tokens": 204644584.0, + "step": 5362 + }, + { + "epoch": 0.6822287240809057, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.432933807373047, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8446940779685974, + "num_tokens": 204678459.0, + "step": 5363 + }, + { + "epoch": 0.6823559343594963, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48377799987793, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.856701672077179, + "num_tokens": 204712923.0, + "step": 5364 + }, + { + "epoch": 0.6824831446380868, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.58852195739746, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8415573835372925, + "num_tokens": 204751303.0, + "step": 5365 + }, + { + "epoch": 0.6826103549166773, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.292598724365234, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8594552278518677, + "num_tokens": 204791685.0, + "step": 5366 + }, + { + "epoch": 0.6827375651952677, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.517864227294922, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8589467406272888, + "num_tokens": 204827748.0, + "step": 5367 + }, + { + "epoch": 0.6828647754738583, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5050106048584, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8744166493415833, + "num_tokens": 204866568.0, + "step": 5368 + }, + { + "epoch": 0.6829919857524488, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48953628540039, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8579548597335815, + "num_tokens": 204908334.0, + "step": 5369 + }, + { + "epoch": 0.6831191960310393, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.685392379760742, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.862116813659668, + "num_tokens": 204949929.0, + "step": 5370 + }, + { + "epoch": 0.6832464063096299, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.444217681884766, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8495839834213257, + "num_tokens": 204991435.0, + "step": 5371 + }, + { + "epoch": 0.6833736165882204, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48164176940918, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8653990626335144, + "num_tokens": 205028434.0, + "step": 5372 + }, + { + "epoch": 0.6835008268668108, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.415775299072266, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8531416654586792, + "num_tokens": 205064138.0, + "step": 5373 + }, + { + "epoch": 0.6836280371454013, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.468721389770508, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.852615475654602, + "num_tokens": 205104058.0, + "step": 5374 + }, + { + "epoch": 0.6837552474239919, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.58036994934082, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8579784035682678, + "num_tokens": 205142609.0, + "step": 5375 + }, + { + "epoch": 0.6838824577025824, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.45664405822754, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.869630753993988, + "num_tokens": 205180407.0, + "step": 5376 + }, + { + "epoch": 0.6840096679811729, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.500368118286133, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8799489736557007, + "num_tokens": 205217340.0, + "step": 5377 + }, + { + "epoch": 0.6841368782597634, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6567325592041, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8472579717636108, + "num_tokens": 205263671.0, + "step": 5378 + }, + { + "epoch": 0.6842640885383539, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.341657638549805, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8557682633399963, + "num_tokens": 205302970.0, + "step": 5379 + }, + { + "epoch": 0.6843912988169444, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.62169075012207, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8630498647689819, + "num_tokens": 205342247.0, + "step": 5380 + }, + { + "epoch": 0.6845185090955349, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.46871566772461, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8442478179931641, + "num_tokens": 205374897.0, + "step": 5381 + }, + { + "epoch": 0.6846457193741254, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49636459350586, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8557592034339905, + "num_tokens": 205411645.0, + "step": 5382 + }, + { + "epoch": 0.684772929652716, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.436952590942383, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8584302663803101, + "num_tokens": 205450972.0, + "step": 5383 + }, + { + "epoch": 0.6849001399313065, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.403390884399414, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8600565195083618, + "num_tokens": 205494314.0, + "step": 5384 + }, + { + "epoch": 0.6850273502098969, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.548213958740234, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8460935354232788, + "num_tokens": 205534696.0, + "step": 5385 + }, + { + "epoch": 0.6851545604884874, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.526897430419922, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8707859516143799, + "num_tokens": 205575579.0, + "step": 5386 + }, + { + "epoch": 0.685281770767078, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59327507019043, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8621677160263062, + "num_tokens": 205609249.0, + "step": 5387 + }, + { + "epoch": 0.6854089810456685, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.705854415893555, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8556163907051086, + "num_tokens": 205644135.0, + "step": 5388 + }, + { + "epoch": 0.685536191324259, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4672794342041, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8525543212890625, + "num_tokens": 205685312.0, + "step": 5389 + }, + { + "epoch": 0.6856634016028496, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.518476486206055, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8529906272888184, + "num_tokens": 205720525.0, + "step": 5390 + }, + { + "epoch": 0.68579061188144, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.635828018188477, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8399347066879272, + "num_tokens": 205763358.0, + "step": 5391 + }, + { + "epoch": 0.6859178221600305, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59174919128418, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8625978827476501, + "num_tokens": 205801413.0, + "step": 5392 + }, + { + "epoch": 0.686045032438621, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.70726203918457, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8604317307472229, + "num_tokens": 205840442.0, + "step": 5393 + }, + { + "epoch": 0.6861722427172116, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48228645324707, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8727189898490906, + "num_tokens": 205877319.0, + "step": 5394 + }, + { + "epoch": 0.6862994529958021, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.70545768737793, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8570373058319092, + "num_tokens": 205916186.0, + "step": 5395 + }, + { + "epoch": 0.6864266632743926, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.57757568359375, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8491688966751099, + "num_tokens": 205956336.0, + "step": 5396 + }, + { + "epoch": 0.686553873552983, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59188461303711, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8568805456161499, + "num_tokens": 205989570.0, + "step": 5397 + }, + { + "epoch": 0.6866810838315736, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.47696876525879, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8510276079177856, + "num_tokens": 206026139.0, + "step": 5398 + }, + { + "epoch": 0.6868082941101641, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.53588104248047, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8534896373748779, + "num_tokens": 206070059.0, + "step": 5399 + }, + { + "epoch": 0.6869355043887546, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.504722595214844, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8625826835632324, + "num_tokens": 206104645.0, + "step": 5400 + }, + { + "epoch": 0.6870627146673451, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.42623519897461, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8565578460693359, + "num_tokens": 206141220.0, + "step": 5401 + }, + { + "epoch": 0.6871899249459357, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.595306396484375, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8564929962158203, + "num_tokens": 206177917.0, + "step": 5402 + }, + { + "epoch": 0.6873171352245261, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.23080062866211, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8584308624267578, + "num_tokens": 206213419.0, + "step": 5403 + }, + { + "epoch": 0.6874443455031166, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.706687927246094, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8430562019348145, + "num_tokens": 206252421.0, + "step": 5404 + }, + { + "epoch": 0.6875715557817071, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.40771484375, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8635157346725464, + "num_tokens": 206287195.0, + "step": 5405 + }, + { + "epoch": 0.6876987660602977, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.661884307861328, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8391972184181213, + "num_tokens": 206328339.0, + "step": 5406 + }, + { + "epoch": 0.6878259763388882, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5484676361084, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8434740900993347, + "num_tokens": 206369523.0, + "step": 5407 + }, + { + "epoch": 0.6879531866174787, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.658124923706055, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.877430260181427, + "num_tokens": 206402864.0, + "step": 5408 + }, + { + "epoch": 0.6880803968960693, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.492748260498047, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8467191457748413, + "num_tokens": 206439004.0, + "step": 5409 + }, + { + "epoch": 0.6882076071746597, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.582216262817383, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8602920770645142, + "num_tokens": 206472320.0, + "step": 5410 + }, + { + "epoch": 0.6883348174532502, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.475435256958008, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8514659404754639, + "num_tokens": 206518649.0, + "step": 5411 + }, + { + "epoch": 0.6884620277318407, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.574569702148438, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8728154897689819, + "num_tokens": 206551318.0, + "step": 5412 + }, + { + "epoch": 0.6885892380104313, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.555938720703125, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8462889194488525, + "num_tokens": 206588173.0, + "step": 5413 + }, + { + "epoch": 0.6887164482890218, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.696617126464844, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8569303750991821, + "num_tokens": 206624013.0, + "step": 5414 + }, + { + "epoch": 0.6888436585676123, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.572275161743164, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8661746978759766, + "num_tokens": 206668180.0, + "step": 5415 + }, + { + "epoch": 0.6889708688462027, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.50442123413086, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.85409015417099, + "num_tokens": 206710619.0, + "step": 5416 + }, + { + "epoch": 0.6890980791247933, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.518325805664062, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8632930517196655, + "num_tokens": 206750121.0, + "step": 5417 + }, + { + "epoch": 0.6892252894033838, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.630155563354492, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8701218366622925, + "num_tokens": 206789538.0, + "step": 5418 + }, + { + "epoch": 0.6893524996819743, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.338153839111328, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8589000105857849, + "num_tokens": 206827513.0, + "step": 5419 + }, + { + "epoch": 0.6894797099605648, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.577207565307617, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8646340370178223, + "num_tokens": 206866563.0, + "step": 5420 + }, + { + "epoch": 0.6896069202391554, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.57097053527832, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8687722086906433, + "num_tokens": 206906281.0, + "step": 5421 + }, + { + "epoch": 0.6897341305177458, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.704185485839844, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8392397165298462, + "num_tokens": 206944055.0, + "step": 5422 + }, + { + "epoch": 0.6898613407963363, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.588321685791016, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8588382005691528, + "num_tokens": 206984421.0, + "step": 5423 + }, + { + "epoch": 0.6899885510749268, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.754352569580078, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8477967381477356, + "num_tokens": 207020319.0, + "step": 5424 + }, + { + "epoch": 0.6901157613535174, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.589611053466797, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8612186908721924, + "num_tokens": 207061487.0, + "step": 5425 + }, + { + "epoch": 0.6902429716321079, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06606674194336, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8596837520599365, + "num_tokens": 207098343.0, + "step": 5426 + }, + { + "epoch": 0.6903701819106984, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.61520767211914, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8553601503372192, + "num_tokens": 207140381.0, + "step": 5427 + }, + { + "epoch": 0.6904973921892888, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.630470275878906, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8718588352203369, + "num_tokens": 207173923.0, + "step": 5428 + }, + { + "epoch": 0.6906246024678794, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.611026763916016, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8645755052566528, + "num_tokens": 207214840.0, + "step": 5429 + }, + { + "epoch": 0.6907518127464699, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.518503189086914, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8576047420501709, + "num_tokens": 207247850.0, + "step": 5430 + }, + { + "epoch": 0.6908790230250604, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.739925384521484, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8590920567512512, + "num_tokens": 207286698.0, + "step": 5431 + }, + { + "epoch": 0.691006233303651, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.618223190307617, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8535851240158081, + "num_tokens": 207322470.0, + "step": 5432 + }, + { + "epoch": 0.6911334435822415, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6240291595459, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8452558517456055, + "num_tokens": 207361517.0, + "step": 5433 + }, + { + "epoch": 0.6912606538608319, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.444053649902344, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8679006099700928, + "num_tokens": 207394504.0, + "step": 5434 + }, + { + "epoch": 0.6913878641394224, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.76421356201172, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8406714200973511, + "num_tokens": 207430148.0, + "step": 5435 + }, + { + "epoch": 0.691515074418013, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.412883758544922, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8561471700668335, + "num_tokens": 207461094.0, + "step": 5436 + }, + { + "epoch": 0.6916422846966035, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.670270919799805, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8655012845993042, + "num_tokens": 207500572.0, + "step": 5437 + }, + { + "epoch": 0.691769494975194, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.70747947692871, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.851774275302887, + "num_tokens": 207534977.0, + "step": 5438 + }, + { + "epoch": 0.6918967052537845, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.482349395751953, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8642470836639404, + "num_tokens": 207569809.0, + "step": 5439 + }, + { + "epoch": 0.692023915532375, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.58627700805664, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.859582781791687, + "num_tokens": 207606785.0, + "step": 5440 + }, + { + "epoch": 0.6921511258109655, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.625843048095703, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8575510382652283, + "num_tokens": 207648344.0, + "step": 5441 + }, + { + "epoch": 0.692278336089556, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.544355392456055, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8658632040023804, + "num_tokens": 207680962.0, + "step": 5442 + }, + { + "epoch": 0.6924055463681466, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.476701736450195, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8913013935089111, + "num_tokens": 207709162.0, + "step": 5443 + }, + { + "epoch": 0.6925327566467371, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.498594284057617, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8548520803451538, + "num_tokens": 207746089.0, + "step": 5444 + }, + { + "epoch": 0.6926599669253276, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.566734313964844, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8698192834854126, + "num_tokens": 207791235.0, + "step": 5445 + }, + { + "epoch": 0.692787177203918, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.475814819335938, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8508811593055725, + "num_tokens": 207824527.0, + "step": 5446 + }, + { + "epoch": 0.6929143874825086, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.809423446655273, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8689929842948914, + "num_tokens": 207863535.0, + "step": 5447 + }, + { + "epoch": 0.6930415977610991, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.538915634155273, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8536374568939209, + "num_tokens": 207899694.0, + "step": 5448 + }, + { + "epoch": 0.6931688080396896, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.52728271484375, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8502679467201233, + "num_tokens": 207931440.0, + "step": 5449 + }, + { + "epoch": 0.6932960183182801, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.52878761291504, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8576250076293945, + "num_tokens": 207963758.0, + "step": 5450 + }, + { + "epoch": 0.6934232285968707, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.61290168762207, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8641480803489685, + "num_tokens": 208002383.0, + "step": 5451 + }, + { + "epoch": 0.6935504388754611, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.747886657714844, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8630238771438599, + "num_tokens": 208038723.0, + "step": 5452 + }, + { + "epoch": 0.6936776491540516, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.717578887939453, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8637030720710754, + "num_tokens": 208071851.0, + "step": 5453 + }, + { + "epoch": 0.6938048594326421, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.73993492126465, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.841429591178894, + "num_tokens": 208106916.0, + "step": 5454 + }, + { + "epoch": 0.6939320697112327, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.61444854736328, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8683387041091919, + "num_tokens": 208146883.0, + "step": 5455 + }, + { + "epoch": 0.6940592799898232, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.649925231933594, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.862384557723999, + "num_tokens": 208187364.0, + "step": 5456 + }, + { + "epoch": 0.6941864902684137, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.950321197509766, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8676486015319824, + "num_tokens": 208226087.0, + "step": 5457 + }, + { + "epoch": 0.6943137005470043, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.601850509643555, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.855428159236908, + "num_tokens": 208265193.0, + "step": 5458 + }, + { + "epoch": 0.6944409108255947, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.66978645324707, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8722876310348511, + "num_tokens": 208305163.0, + "step": 5459 + }, + { + "epoch": 0.6945681211041852, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.71515464782715, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.848050594329834, + "num_tokens": 208342229.0, + "step": 5460 + }, + { + "epoch": 0.6946953313827757, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59257698059082, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8491359353065491, + "num_tokens": 208384606.0, + "step": 5461 + }, + { + "epoch": 0.6948225416613663, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.893638610839844, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8794071674346924, + "num_tokens": 208416777.0, + "step": 5462 + }, + { + "epoch": 0.6949497519399568, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55622673034668, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8438908457756042, + "num_tokens": 208456101.0, + "step": 5463 + }, + { + "epoch": 0.6950769622185473, + "ewc_loss": 0.02490234375, + "ewc_loss_parallel": 2.491474151611328e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.642051696777344, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8693584203720093, + "num_tokens": 208489201.0, + "step": 5464 + }, + { + "epoch": 0.6952041724971377, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.910432815551758, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8736470937728882, + "num_tokens": 208525006.0, + "step": 5465 + }, + { + "epoch": 0.6953313827757283, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5006160736084, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8568676114082336, + "num_tokens": 208568997.0, + "step": 5466 + }, + { + "epoch": 0.6954585930543188, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.69193458557129, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8515927791595459, + "num_tokens": 208605925.0, + "step": 5467 + }, + { + "epoch": 0.6955858033329093, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.578428268432617, + "learning_rate": 1e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8350382447242737, + "num_tokens": 208646290.0, + "step": 5468 + }, + { + "epoch": 0.6957130136114998, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.596525192260742, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8611417412757874, + "num_tokens": 208686424.0, + "step": 5469 + }, + { + "epoch": 0.6958402238900904, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.58121109008789, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8617743849754333, + "num_tokens": 208719643.0, + "step": 5470 + }, + { + "epoch": 0.6959674341686808, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.396011352539062, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8424609899520874, + "num_tokens": 208760804.0, + "step": 5471 + }, + { + "epoch": 0.6960946444472713, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.733957290649414, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8533243536949158, + "num_tokens": 208803022.0, + "step": 5472 + }, + { + "epoch": 0.6962218547258618, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.488990783691406, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8599865436553955, + "num_tokens": 208844205.0, + "step": 5473 + }, + { + "epoch": 0.6963490650044524, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.60079574584961, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.869513988494873, + "num_tokens": 208886882.0, + "step": 5474 + }, + { + "epoch": 0.6964762752830429, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.487218856811523, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8568602800369263, + "num_tokens": 208926649.0, + "step": 5475 + }, + { + "epoch": 0.6966034855616334, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6299991607666, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8532997965812683, + "num_tokens": 208963403.0, + "step": 5476 + }, + { + "epoch": 0.6967306958402238, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.588054656982422, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8515642285346985, + "num_tokens": 209002475.0, + "step": 5477 + }, + { + "epoch": 0.6968579061188144, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.528228759765625, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.859249472618103, + "num_tokens": 209044216.0, + "step": 5478 + }, + { + "epoch": 0.6969851163974049, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.483068466186523, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8484359979629517, + "num_tokens": 209080774.0, + "step": 5479 + }, + { + "epoch": 0.6971123266759954, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55754852294922, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8652287125587463, + "num_tokens": 209117167.0, + "step": 5480 + }, + { + "epoch": 0.697239536954586, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5550479888916, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8527759313583374, + "num_tokens": 209148872.0, + "step": 5481 + }, + { + "epoch": 0.6973667472331765, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.79514503479004, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8611791729927063, + "num_tokens": 209187912.0, + "step": 5482 + }, + { + "epoch": 0.6974939575117669, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.409183502197266, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8665585517883301, + "num_tokens": 209227055.0, + "step": 5483 + }, + { + "epoch": 0.6976211677903574, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.63766098022461, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.873843789100647, + "num_tokens": 209255343.0, + "step": 5484 + }, + { + "epoch": 0.697748378068948, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.628965377807617, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8527157306671143, + "num_tokens": 209294621.0, + "step": 5485 + }, + { + "epoch": 0.6978755883475385, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48184585571289, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8746687173843384, + "num_tokens": 209336519.0, + "step": 5486 + }, + { + "epoch": 0.698002798626129, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.904104232788086, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8567258715629578, + "num_tokens": 209372913.0, + "step": 5487 + }, + { + "epoch": 0.6981300089047195, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.46046257019043, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8501132726669312, + "num_tokens": 209417768.0, + "step": 5488 + }, + { + "epoch": 0.69825721918331, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.66332244873047, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8609336614608765, + "num_tokens": 209451221.0, + "step": 5489 + }, + { + "epoch": 0.6983844294619005, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.718814849853516, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8580175638198853, + "num_tokens": 209488710.0, + "step": 5490 + }, + { + "epoch": 0.698511639740491, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.469223022460938, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.85577392578125, + "num_tokens": 209525166.0, + "step": 5491 + }, + { + "epoch": 0.6986388500190815, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.909814834594727, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8609896898269653, + "num_tokens": 209564804.0, + "step": 5492 + }, + { + "epoch": 0.6987660602976721, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.489343643188477, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8628177642822266, + "num_tokens": 209607155.0, + "step": 5493 + }, + { + "epoch": 0.6988932705762626, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.56978416442871, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8487330675125122, + "num_tokens": 209641226.0, + "step": 5494 + }, + { + "epoch": 0.699020480854853, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.502071380615234, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8560518026351929, + "num_tokens": 209679180.0, + "step": 5495 + }, + { + "epoch": 0.6991476911334435, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.663387298583984, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8510491847991943, + "num_tokens": 209716412.0, + "step": 5496 + }, + { + "epoch": 0.6992749014120341, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.489187240600586, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8505464196205139, + "num_tokens": 209755697.0, + "step": 5497 + }, + { + "epoch": 0.6994021116906246, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.038402557373047, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8656336069107056, + "num_tokens": 209793654.0, + "step": 5498 + }, + { + "epoch": 0.6995293219692151, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.492136001586914, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.856715977191925, + "num_tokens": 209830703.0, + "step": 5499 + }, + { + "epoch": 0.6996565322478057, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.8231201171875, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8596088886260986, + "num_tokens": 209868994.0, + "step": 5500 + }, + { + "epoch": 0.6997837425263961, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.628164291381836, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.868904709815979, + "num_tokens": 209909605.0, + "step": 5501 + }, + { + "epoch": 0.6999109528049866, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.577713012695312, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8648240566253662, + "num_tokens": 209945196.0, + "step": 5502 + }, + { + "epoch": 0.7000381630835771, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.662389755249023, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8595690727233887, + "num_tokens": 209988585.0, + "step": 5503 + }, + { + "epoch": 0.7001653733621677, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.746044158935547, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8586270809173584, + "num_tokens": 210032084.0, + "step": 5504 + }, + { + "epoch": 0.7002925836407582, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.511463165283203, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8486500978469849, + "num_tokens": 210070639.0, + "step": 5505 + }, + { + "epoch": 0.7004197939193487, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.65510368347168, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8667988181114197, + "num_tokens": 210113793.0, + "step": 5506 + }, + { + "epoch": 0.7005470041979391, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.911550521850586, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8501590490341187, + "num_tokens": 210153517.0, + "step": 5507 + }, + { + "epoch": 0.7006742144765297, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.640531539916992, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8544399738311768, + "num_tokens": 210186818.0, + "step": 5508 + }, + { + "epoch": 0.7008014247551202, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.837411880493164, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8515729904174805, + "num_tokens": 210224283.0, + "step": 5509 + }, + { + "epoch": 0.7009286350337107, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.5792293548584, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8537285327911377, + "num_tokens": 210263439.0, + "step": 5510 + }, + { + "epoch": 0.7010558453123013, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7557373046875, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.861211359500885, + "num_tokens": 210305887.0, + "step": 5511 + }, + { + "epoch": 0.7011830555908918, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9171142578125, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.858139157295227, + "num_tokens": 210340118.0, + "step": 5512 + }, + { + "epoch": 0.7013102658694823, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.53414535522461, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8601911664009094, + "num_tokens": 210376098.0, + "step": 5513 + }, + { + "epoch": 0.7014374761480727, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.605693817138672, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.862366795539856, + "num_tokens": 210418801.0, + "step": 5514 + }, + { + "epoch": 0.7015646864266633, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.784257888793945, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8482547402381897, + "num_tokens": 210452392.0, + "step": 5515 + }, + { + "epoch": 0.7016918967052538, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.543928146362305, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8665475845336914, + "num_tokens": 210489697.0, + "step": 5516 + }, + { + "epoch": 0.7018191069838443, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.621746063232422, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8431438207626343, + "num_tokens": 210530212.0, + "step": 5517 + }, + { + "epoch": 0.7019463172624348, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.79007911682129, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8644417524337769, + "num_tokens": 210564974.0, + "step": 5518 + }, + { + "epoch": 0.7020735275410254, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.664695739746094, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8595051765441895, + "num_tokens": 210601272.0, + "step": 5519 + }, + { + "epoch": 0.7022007378196158, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.63638687133789, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8615559339523315, + "num_tokens": 210636908.0, + "step": 5520 + }, + { + "epoch": 0.7023279480982063, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.714967727661133, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8559244871139526, + "num_tokens": 210672629.0, + "step": 5521 + }, + { + "epoch": 0.7024551583767968, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.431121826171875, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8727867007255554, + "num_tokens": 210703531.0, + "step": 5522 + }, + { + "epoch": 0.7025823686553874, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.708961486816406, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8728567361831665, + "num_tokens": 210742946.0, + "step": 5523 + }, + { + "epoch": 0.7027095789339779, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6518497467041, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8676024079322815, + "num_tokens": 210778345.0, + "step": 5524 + }, + { + "epoch": 0.7028367892125684, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.506248474121094, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8656424283981323, + "num_tokens": 210821544.0, + "step": 5525 + }, + { + "epoch": 0.7029639994911588, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.521053314208984, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8629918098449707, + "num_tokens": 210858060.0, + "step": 5526 + }, + { + "epoch": 0.7030912097697494, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.613277435302734, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.853748619556427, + "num_tokens": 210900365.0, + "step": 5527 + }, + { + "epoch": 0.7032184200483399, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.758291244506836, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8537037968635559, + "num_tokens": 210941118.0, + "step": 5528 + }, + { + "epoch": 0.7033456303269304, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.49309730529785, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8661469221115112, + "num_tokens": 210979865.0, + "step": 5529 + }, + { + "epoch": 0.703472840605521, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.728818893432617, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8592228889465332, + "num_tokens": 211016385.0, + "step": 5530 + }, + { + "epoch": 0.7036000508841115, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.48691749572754, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8471806049346924, + "num_tokens": 211059071.0, + "step": 5531 + }, + { + "epoch": 0.7037272611627019, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.635068893432617, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8617343902587891, + "num_tokens": 211094189.0, + "step": 5532 + }, + { + "epoch": 0.7038544714412924, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.777254104614258, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8729976415634155, + "num_tokens": 211130524.0, + "step": 5533 + }, + { + "epoch": 0.703981681719883, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.727920532226562, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8528088331222534, + "num_tokens": 211172335.0, + "step": 5534 + }, + { + "epoch": 0.7041088919984735, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.698322296142578, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8614920377731323, + "num_tokens": 211205406.0, + "step": 5535 + }, + { + "epoch": 0.704236102277064, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.837488174438477, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8550399541854858, + "num_tokens": 211246443.0, + "step": 5536 + }, + { + "epoch": 0.7043633125556545, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.715787887573242, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8735146522521973, + "num_tokens": 211286707.0, + "step": 5537 + }, + { + "epoch": 0.704490522834245, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55600357055664, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8708081841468811, + "num_tokens": 211319448.0, + "step": 5538 + }, + { + "epoch": 0.7046177331128355, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.553260803222656, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8534194231033325, + "num_tokens": 211363747.0, + "step": 5539 + }, + { + "epoch": 0.704744943391426, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.65361213684082, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8662611246109009, + "num_tokens": 211408080.0, + "step": 5540 + }, + { + "epoch": 0.7048721536700165, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.621259689331055, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8492926359176636, + "num_tokens": 211445656.0, + "step": 5541 + }, + { + "epoch": 0.7049993639486071, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.541290283203125, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.868895411491394, + "num_tokens": 211484342.0, + "step": 5542 + }, + { + "epoch": 0.7051265742271976, + "ewc_loss": 0.025146484375, + "ewc_loss_parallel": 2.5153160095214844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.67727279663086, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8547888994216919, + "num_tokens": 211518535.0, + "step": 5543 + }, + { + "epoch": 0.705253784505788, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6876277923584, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8532991409301758, + "num_tokens": 211557236.0, + "step": 5544 + }, + { + "epoch": 0.7053809947843785, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.558561325073242, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.870479166507721, + "num_tokens": 211599216.0, + "step": 5545 + }, + { + "epoch": 0.7055082050629691, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.553247451782227, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8414430618286133, + "num_tokens": 211638930.0, + "step": 5546 + }, + { + "epoch": 0.7056354153415596, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.71930694580078, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8518249988555908, + "num_tokens": 211680765.0, + "step": 5547 + }, + { + "epoch": 0.7057626256201501, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.791078567504883, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8735635876655579, + "num_tokens": 211719037.0, + "step": 5548 + }, + { + "epoch": 0.7058898358987407, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.603906631469727, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.85337895154953, + "num_tokens": 211759655.0, + "step": 5549 + }, + { + "epoch": 0.7060170461773311, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.741653442382812, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8421114683151245, + "num_tokens": 211800405.0, + "step": 5550 + }, + { + "epoch": 0.7061442564559216, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59941291809082, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8408757448196411, + "num_tokens": 211846057.0, + "step": 5551 + }, + { + "epoch": 0.7062714667345121, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90534782409668, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8568508625030518, + "num_tokens": 211885944.0, + "step": 5552 + }, + { + "epoch": 0.7063986770131027, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59206199645996, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8626556992530823, + "num_tokens": 211928727.0, + "step": 5553 + }, + { + "epoch": 0.7065258872916932, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.615074157714844, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8492277264595032, + "num_tokens": 211967593.0, + "step": 5554 + }, + { + "epoch": 0.7066530975702837, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.66881561279297, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8532274961471558, + "num_tokens": 212001023.0, + "step": 5555 + }, + { + "epoch": 0.7067803078488741, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.707836151123047, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8512296676635742, + "num_tokens": 212045611.0, + "step": 5556 + }, + { + "epoch": 0.7069075181274647, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.68781280517578, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8538520336151123, + "num_tokens": 212087167.0, + "step": 5557 + }, + { + "epoch": 0.7070347284060552, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51172637939453, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8606602549552917, + "num_tokens": 212124254.0, + "step": 5558 + }, + { + "epoch": 0.7071619386846457, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.710988998413086, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8529226779937744, + "num_tokens": 212162706.0, + "step": 5559 + }, + { + "epoch": 0.7072891489632362, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.71660804748535, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8581644296646118, + "num_tokens": 212197359.0, + "step": 5560 + }, + { + "epoch": 0.7074163592418268, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.525924682617188, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.865699052810669, + "num_tokens": 212237300.0, + "step": 5561 + }, + { + "epoch": 0.7075435695204173, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.560997009277344, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8398507833480835, + "num_tokens": 212283472.0, + "step": 5562 + }, + { + "epoch": 0.7076707797990077, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.85010528564453, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8534522652626038, + "num_tokens": 212321433.0, + "step": 5563 + }, + { + "epoch": 0.7077979900775982, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.52655601501465, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8568450212478638, + "num_tokens": 212362234.0, + "step": 5564 + }, + { + "epoch": 0.7079252003561888, + "ewc_loss": 0.0252685546875, + "ewc_loss_parallel": 2.5272369384765625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.73235321044922, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8494424223899841, + "num_tokens": 212399259.0, + "step": 5565 + }, + { + "epoch": 0.7080524106347793, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.591489791870117, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8588093519210815, + "num_tokens": 212438489.0, + "step": 5566 + }, + { + "epoch": 0.7081796209133698, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6820011138916, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8500183820724487, + "num_tokens": 212482283.0, + "step": 5567 + }, + { + "epoch": 0.7083068311919604, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.594635009765625, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8538028001785278, + "num_tokens": 212522749.0, + "step": 5568 + }, + { + "epoch": 0.7084340414705508, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.42905616760254, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8495906591415405, + "num_tokens": 212559937.0, + "step": 5569 + }, + { + "epoch": 0.7085612517491413, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.643970489501953, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8641430139541626, + "num_tokens": 212595578.0, + "step": 5570 + }, + { + "epoch": 0.7086884620277318, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.679771423339844, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8696244955062866, + "num_tokens": 212634085.0, + "step": 5571 + }, + { + "epoch": 0.7088156723063224, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.622739791870117, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8473275899887085, + "num_tokens": 212673557.0, + "step": 5572 + }, + { + "epoch": 0.7089428825849129, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.72766876220703, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8681477308273315, + "num_tokens": 212711142.0, + "step": 5573 + }, + { + "epoch": 0.7090700928635034, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.525943756103516, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.857940673828125, + "num_tokens": 212745570.0, + "step": 5574 + }, + { + "epoch": 0.7091973031420938, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.707626342773438, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8622592091560364, + "num_tokens": 212783048.0, + "step": 5575 + }, + { + "epoch": 0.7093245134206844, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.803394317626953, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8765433430671692, + "num_tokens": 212821231.0, + "step": 5576 + }, + { + "epoch": 0.7094517236992749, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.798137664794922, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.856773316860199, + "num_tokens": 212861715.0, + "step": 5577 + }, + { + "epoch": 0.7095789339778654, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.655879974365234, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8480468988418579, + "num_tokens": 212900655.0, + "step": 5578 + }, + { + "epoch": 0.709706144256456, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.67107582092285, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8496466875076294, + "num_tokens": 212939308.0, + "step": 5579 + }, + { + "epoch": 0.7098333545350465, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.69240951538086, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8624744415283203, + "num_tokens": 212981174.0, + "step": 5580 + }, + { + "epoch": 0.7099605648136369, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.551319122314453, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.868867814540863, + "num_tokens": 213016401.0, + "step": 5581 + }, + { + "epoch": 0.7100877750922274, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.643512725830078, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8468486666679382, + "num_tokens": 213058480.0, + "step": 5582 + }, + { + "epoch": 0.710214985370818, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.715139389038086, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8712561130523682, + "num_tokens": 213093527.0, + "step": 5583 + }, + { + "epoch": 0.7103421956494085, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.736539840698242, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8557080626487732, + "num_tokens": 213126778.0, + "step": 5584 + }, + { + "epoch": 0.710469405927999, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.38559341430664, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.861039400100708, + "num_tokens": 213170863.0, + "step": 5585 + }, + { + "epoch": 0.7105966162065895, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.66667366027832, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8351558446884155, + "num_tokens": 213211336.0, + "step": 5586 + }, + { + "epoch": 0.71072382648518, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.700288772583008, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8652898073196411, + "num_tokens": 213245262.0, + "step": 5587 + }, + { + "epoch": 0.7108510367637705, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.71442222595215, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8500149250030518, + "num_tokens": 213282150.0, + "step": 5588 + }, + { + "epoch": 0.710978247042361, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.82441520690918, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8635960817337036, + "num_tokens": 213325573.0, + "step": 5589 + }, + { + "epoch": 0.7111054573209515, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.65201759338379, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8672682642936707, + "num_tokens": 213363738.0, + "step": 5590 + }, + { + "epoch": 0.7112326675995421, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.653125762939453, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8636888861656189, + "num_tokens": 213404796.0, + "step": 5591 + }, + { + "epoch": 0.7113598778781326, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.715633392333984, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8517733216285706, + "num_tokens": 213436711.0, + "step": 5592 + }, + { + "epoch": 0.711487088156723, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.842182159423828, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8614267706871033, + "num_tokens": 213473458.0, + "step": 5593 + }, + { + "epoch": 0.7116142984353135, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6884765625, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8689360618591309, + "num_tokens": 213507355.0, + "step": 5594 + }, + { + "epoch": 0.7117415087139041, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.64044189453125, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8418018817901611, + "num_tokens": 213543771.0, + "step": 5595 + }, + { + "epoch": 0.7118687189924946, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.64981460571289, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8567004203796387, + "num_tokens": 213578317.0, + "step": 5596 + }, + { + "epoch": 0.7119959292710851, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.61687469482422, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8542336821556091, + "num_tokens": 213615795.0, + "step": 5597 + }, + { + "epoch": 0.7121231395496757, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.833425521850586, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8678750991821289, + "num_tokens": 213655452.0, + "step": 5598 + }, + { + "epoch": 0.7122503498282661, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.65110206604004, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8497308492660522, + "num_tokens": 213688079.0, + "step": 5599 + }, + { + "epoch": 0.7123775601068566, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.583457946777344, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8752586841583252, + "num_tokens": 213725310.0, + "step": 5600 + }, + { + "epoch": 0.7125047703854471, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55902862548828, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8527411222457886, + "num_tokens": 213757950.0, + "step": 5601 + }, + { + "epoch": 0.7126319806640377, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.51940155029297, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8457401990890503, + "num_tokens": 213794146.0, + "step": 5602 + }, + { + "epoch": 0.7127591909426282, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.750123977661133, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8630613684654236, + "num_tokens": 213834922.0, + "step": 5603 + }, + { + "epoch": 0.7128864012212187, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.576187133789062, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8773799538612366, + "num_tokens": 213879651.0, + "step": 5604 + }, + { + "epoch": 0.7130136114998091, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.542551040649414, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8577470779418945, + "num_tokens": 213921833.0, + "step": 5605 + }, + { + "epoch": 0.7131408217783997, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.548381805419922, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8641151189804077, + "num_tokens": 213962909.0, + "step": 5606 + }, + { + "epoch": 0.7132680320569902, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.607378005981445, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8608404994010925, + "num_tokens": 214002030.0, + "step": 5607 + }, + { + "epoch": 0.7133952423355807, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.733388900756836, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8520345687866211, + "num_tokens": 214037094.0, + "step": 5608 + }, + { + "epoch": 0.7135224526141712, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.826820373535156, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8626241683959961, + "num_tokens": 214076649.0, + "step": 5609 + }, + { + "epoch": 0.7136496628927618, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.734237670898438, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.853783369064331, + "num_tokens": 214114194.0, + "step": 5610 + }, + { + "epoch": 0.7137768731713523, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.622182846069336, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8598649501800537, + "num_tokens": 214149652.0, + "step": 5611 + }, + { + "epoch": 0.7139040834499427, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.65350341796875, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.868546724319458, + "num_tokens": 214189068.0, + "step": 5612 + }, + { + "epoch": 0.7140312937285332, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.8078670501709, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8464235663414001, + "num_tokens": 214220251.0, + "step": 5613 + }, + { + "epoch": 0.7141585040071238, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.570878982543945, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8558962345123291, + "num_tokens": 214256705.0, + "step": 5614 + }, + { + "epoch": 0.7142857142857143, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.679716110229492, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8730483055114746, + "num_tokens": 214292563.0, + "step": 5615 + }, + { + "epoch": 0.7144129245643048, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.689674377441406, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8524881601333618, + "num_tokens": 214333843.0, + "step": 5616 + }, + { + "epoch": 0.7145401348428954, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.58241844177246, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8703951239585876, + "num_tokens": 214372272.0, + "step": 5617 + }, + { + "epoch": 0.7146673451214858, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.585933685302734, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8652868270874023, + "num_tokens": 214416364.0, + "step": 5618 + }, + { + "epoch": 0.7147945554000763, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0578556060791, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8569232225418091, + "num_tokens": 214453102.0, + "step": 5619 + }, + { + "epoch": 0.7149217656786668, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.55767059326172, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8628018498420715, + "num_tokens": 214492374.0, + "step": 5620 + }, + { + "epoch": 0.7150489759572574, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.178346633911133, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8585045337677002, + "num_tokens": 214530918.0, + "step": 5621 + }, + { + "epoch": 0.7151761862358479, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.744953155517578, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8748236894607544, + "num_tokens": 214570227.0, + "step": 5622 + }, + { + "epoch": 0.7153033965144384, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.713979721069336, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8620715141296387, + "num_tokens": 214609519.0, + "step": 5623 + }, + { + "epoch": 0.7154306067930288, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.711910247802734, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8562455177307129, + "num_tokens": 214643564.0, + "step": 5624 + }, + { + "epoch": 0.7155578170716194, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.563722610473633, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8677464723587036, + "num_tokens": 214677632.0, + "step": 5625 + }, + { + "epoch": 0.7156850273502099, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.978116989135742, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8610891103744507, + "num_tokens": 214710363.0, + "step": 5626 + }, + { + "epoch": 0.7158122376288004, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.809547424316406, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8666205406188965, + "num_tokens": 214746982.0, + "step": 5627 + }, + { + "epoch": 0.715939447907391, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.707000732421875, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.850096583366394, + "num_tokens": 214786447.0, + "step": 5628 + }, + { + "epoch": 0.7160666581859815, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.606945037841797, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8671063780784607, + "num_tokens": 214824627.0, + "step": 5629 + }, + { + "epoch": 0.7161938684645719, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.757478713989258, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8407160043716431, + "num_tokens": 214863051.0, + "step": 5630 + }, + { + "epoch": 0.7163210787431624, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.8193302154541, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8460022211074829, + "num_tokens": 214900524.0, + "step": 5631 + }, + { + "epoch": 0.716448289021753, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.604930877685547, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8651520609855652, + "num_tokens": 214943949.0, + "step": 5632 + }, + { + "epoch": 0.7165754993003435, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.743350982666016, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8640186786651611, + "num_tokens": 214979126.0, + "step": 5633 + }, + { + "epoch": 0.716702709578934, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7022762298584, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.858666718006134, + "num_tokens": 215017740.0, + "step": 5634 + }, + { + "epoch": 0.7168299198575245, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.588871002197266, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.867821455001831, + "num_tokens": 215054472.0, + "step": 5635 + }, + { + "epoch": 0.716957130136115, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.72745132446289, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8615669012069702, + "num_tokens": 215086894.0, + "step": 5636 + }, + { + "epoch": 0.7170843404147055, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.615930557250977, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8608309626579285, + "num_tokens": 215128157.0, + "step": 5637 + }, + { + "epoch": 0.717211550693296, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.907079696655273, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8663646578788757, + "num_tokens": 215161008.0, + "step": 5638 + }, + { + "epoch": 0.7173387609718865, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.939193725585938, + "learning_rate": 1e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8431225419044495, + "num_tokens": 215197157.0, + "step": 5639 + }, + { + "epoch": 0.7174659712504771, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.647823333740234, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8666174411773682, + "num_tokens": 215240270.0, + "step": 5640 + }, + { + "epoch": 0.7175931815290676, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.67530059814453, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8668144345283508, + "num_tokens": 215279828.0, + "step": 5641 + }, + { + "epoch": 0.717720391807658, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.942340850830078, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8532449007034302, + "num_tokens": 215315271.0, + "step": 5642 + }, + { + "epoch": 0.7178476020862485, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.685640335083008, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8614580631256104, + "num_tokens": 215352621.0, + "step": 5643 + }, + { + "epoch": 0.7179748123648391, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.715076446533203, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8614062666893005, + "num_tokens": 215390231.0, + "step": 5644 + }, + { + "epoch": 0.7181020226434296, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.903085708618164, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8738332390785217, + "num_tokens": 215426812.0, + "step": 5645 + }, + { + "epoch": 0.7182292329220201, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.4892578125, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8483664989471436, + "num_tokens": 215467551.0, + "step": 5646 + }, + { + "epoch": 0.7183564432006107, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.700693130493164, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8683381676673889, + "num_tokens": 215508559.0, + "step": 5647 + }, + { + "epoch": 0.7184836534792011, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.75897216796875, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8527050018310547, + "num_tokens": 215548709.0, + "step": 5648 + }, + { + "epoch": 0.7186108637577916, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.611785888671875, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8601179122924805, + "num_tokens": 215593844.0, + "step": 5649 + }, + { + "epoch": 0.7187380740363821, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.727590560913086, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8652157783508301, + "num_tokens": 215633853.0, + "step": 5650 + }, + { + "epoch": 0.7188652843149727, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.644176483154297, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8581444025039673, + "num_tokens": 215667272.0, + "step": 5651 + }, + { + "epoch": 0.7189924945935632, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.507009506225586, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8433292508125305, + "num_tokens": 215705889.0, + "step": 5652 + }, + { + "epoch": 0.7191197048721537, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.600908279418945, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8683956861495972, + "num_tokens": 215749143.0, + "step": 5653 + }, + { + "epoch": 0.7192469151507441, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.837711334228516, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8537968397140503, + "num_tokens": 215787583.0, + "step": 5654 + }, + { + "epoch": 0.7193741254293347, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.677549362182617, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8606213331222534, + "num_tokens": 215825552.0, + "step": 5655 + }, + { + "epoch": 0.7195013357079252, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.658546447753906, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8684845566749573, + "num_tokens": 215858310.0, + "step": 5656 + }, + { + "epoch": 0.7196285459865157, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.577939987182617, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8490127325057983, + "num_tokens": 215898403.0, + "step": 5657 + }, + { + "epoch": 0.7197557562651062, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.700408935546875, + "learning_rate": 1e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8373379111289978, + "num_tokens": 215935930.0, + "step": 5658 + }, + { + "epoch": 0.7198829665436968, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.594364166259766, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8567394614219666, + "num_tokens": 215971509.0, + "step": 5659 + }, + { + "epoch": 0.7200101768222873, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.815204620361328, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8651672601699829, + "num_tokens": 216008696.0, + "step": 5660 + }, + { + "epoch": 0.7201373871008777, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.68458366394043, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8598508834838867, + "num_tokens": 216045861.0, + "step": 5661 + }, + { + "epoch": 0.7202645973794682, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.640560150146484, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8694025278091431, + "num_tokens": 216081307.0, + "step": 5662 + }, + { + "epoch": 0.7203918076580588, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.781431198120117, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8689778447151184, + "num_tokens": 216118506.0, + "step": 5663 + }, + { + "epoch": 0.7205190179366493, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.590076446533203, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8532217741012573, + "num_tokens": 216159000.0, + "step": 5664 + }, + { + "epoch": 0.7206462282152398, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.595680236816406, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8528568148612976, + "num_tokens": 216204712.0, + "step": 5665 + }, + { + "epoch": 0.7207734384938304, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.68852996826172, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8522253036499023, + "num_tokens": 216235001.0, + "step": 5666 + }, + { + "epoch": 0.7209006487724208, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.718868255615234, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8527838587760925, + "num_tokens": 216271383.0, + "step": 5667 + }, + { + "epoch": 0.7210278590510113, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.720487594604492, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8595845699310303, + "num_tokens": 216307923.0, + "step": 5668 + }, + { + "epoch": 0.7211550693296018, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.663463592529297, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8784030675888062, + "num_tokens": 216345147.0, + "step": 5669 + }, + { + "epoch": 0.7212822796081924, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.719440460205078, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8511463403701782, + "num_tokens": 216386991.0, + "step": 5670 + }, + { + "epoch": 0.7214094898867829, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.651960372924805, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8623852729797363, + "num_tokens": 216425534.0, + "step": 5671 + }, + { + "epoch": 0.7215367001653734, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.70762062072754, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8599388003349304, + "num_tokens": 216466686.0, + "step": 5672 + }, + { + "epoch": 0.7216639104439638, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.540504455566406, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8720797300338745, + "num_tokens": 216505245.0, + "step": 5673 + }, + { + "epoch": 0.7217911207225544, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.852649688720703, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8569668531417847, + "num_tokens": 216538024.0, + "step": 5674 + }, + { + "epoch": 0.7219183310011449, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.770891189575195, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8589297533035278, + "num_tokens": 216573722.0, + "step": 5675 + }, + { + "epoch": 0.7220455412797354, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.714401245117188, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8654885292053223, + "num_tokens": 216610362.0, + "step": 5676 + }, + { + "epoch": 0.7221727515583259, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.713424682617188, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8509114980697632, + "num_tokens": 216648732.0, + "step": 5677 + }, + { + "epoch": 0.7222999618369165, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.649682998657227, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8672050833702087, + "num_tokens": 216680324.0, + "step": 5678 + }, + { + "epoch": 0.7224271721155069, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.82868766784668, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8641403913497925, + "num_tokens": 216712773.0, + "step": 5679 + }, + { + "epoch": 0.7225543823940974, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.61639976501465, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8524245619773865, + "num_tokens": 216752538.0, + "step": 5680 + }, + { + "epoch": 0.722681592672688, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.841129302978516, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8626365065574646, + "num_tokens": 216791795.0, + "step": 5681 + }, + { + "epoch": 0.7228088029512785, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.676393508911133, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8566485643386841, + "num_tokens": 216831729.0, + "step": 5682 + }, + { + "epoch": 0.722936013229869, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.698514938354492, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8611080050468445, + "num_tokens": 216869494.0, + "step": 5683 + }, + { + "epoch": 0.7230632235084595, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.67701530456543, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8416295051574707, + "num_tokens": 216906672.0, + "step": 5684 + }, + { + "epoch": 0.72319043378705, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.75499153137207, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8666875958442688, + "num_tokens": 216941587.0, + "step": 5685 + }, + { + "epoch": 0.7233176440656405, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.925809860229492, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8533892631530762, + "num_tokens": 216981812.0, + "step": 5686 + }, + { + "epoch": 0.723444854344231, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.718032836914062, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8690398931503296, + "num_tokens": 217024137.0, + "step": 5687 + }, + { + "epoch": 0.7235720646228215, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81768035888672, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8725360631942749, + "num_tokens": 217063753.0, + "step": 5688 + }, + { + "epoch": 0.7236992749014121, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.801122665405273, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8587522506713867, + "num_tokens": 217105611.0, + "step": 5689 + }, + { + "epoch": 0.7238264851800026, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.919288635253906, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8393797278404236, + "num_tokens": 217145989.0, + "step": 5690 + }, + { + "epoch": 0.723953695458593, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.738460540771484, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8787074089050293, + "num_tokens": 217183726.0, + "step": 5691 + }, + { + "epoch": 0.7240809057371835, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.781097412109375, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8574961423873901, + "num_tokens": 217226644.0, + "step": 5692 + }, + { + "epoch": 0.7242081160157741, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.744691848754883, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8452857732772827, + "num_tokens": 217263327.0, + "step": 5693 + }, + { + "epoch": 0.7243353262943646, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.810190200805664, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8653939962387085, + "num_tokens": 217300595.0, + "step": 5694 + }, + { + "epoch": 0.7244625365729551, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81233024597168, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8463419675827026, + "num_tokens": 217336385.0, + "step": 5695 + }, + { + "epoch": 0.7245897468515456, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.751888275146484, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8664553165435791, + "num_tokens": 217374309.0, + "step": 5696 + }, + { + "epoch": 0.7247169571301361, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.747230529785156, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8502011895179749, + "num_tokens": 217408386.0, + "step": 5697 + }, + { + "epoch": 0.7248441674087266, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.789087295532227, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8459542393684387, + "num_tokens": 217442855.0, + "step": 5698 + }, + { + "epoch": 0.7249713776873171, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.805204391479492, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8644190430641174, + "num_tokens": 217476892.0, + "step": 5699 + }, + { + "epoch": 0.7250985879659076, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.720375061035156, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8580116033554077, + "num_tokens": 217513569.0, + "step": 5700 + }, + { + "epoch": 0.7252257982444982, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.672119140625, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8538476228713989, + "num_tokens": 217545585.0, + "step": 5701 + }, + { + "epoch": 0.7253530085230887, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.69599151611328, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8617274165153503, + "num_tokens": 217583156.0, + "step": 5702 + }, + { + "epoch": 0.7254802188016791, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.826236724853516, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8596898913383484, + "num_tokens": 217619630.0, + "step": 5703 + }, + { + "epoch": 0.7256074290802697, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.74468421936035, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8705494403839111, + "num_tokens": 217653616.0, + "step": 5704 + }, + { + "epoch": 0.7257346393588602, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.606807708740234, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8549152612686157, + "num_tokens": 217692758.0, + "step": 5705 + }, + { + "epoch": 0.7258618496374507, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.526594161987305, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8446744084358215, + "num_tokens": 217734175.0, + "step": 5706 + }, + { + "epoch": 0.7259890599160412, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.737642288208008, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8371009826660156, + "num_tokens": 217773372.0, + "step": 5707 + }, + { + "epoch": 0.7261162701946318, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.637224197387695, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.869242250919342, + "num_tokens": 217809174.0, + "step": 5708 + }, + { + "epoch": 0.7262434804732223, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.74473762512207, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8534417748451233, + "num_tokens": 217845882.0, + "step": 5709 + }, + { + "epoch": 0.7263706907518127, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.746917724609375, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8559772968292236, + "num_tokens": 217885798.0, + "step": 5710 + }, + { + "epoch": 0.7264979010304032, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.720861434936523, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8565664291381836, + "num_tokens": 217924967.0, + "step": 5711 + }, + { + "epoch": 0.7266251113089938, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.60325050354004, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8704596161842346, + "num_tokens": 217967381.0, + "step": 5712 + }, + { + "epoch": 0.7267523215875843, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.99053955078125, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8655179142951965, + "num_tokens": 218001097.0, + "step": 5713 + }, + { + "epoch": 0.7268795318661748, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.573945999145508, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8619989156723022, + "num_tokens": 218040705.0, + "step": 5714 + }, + { + "epoch": 0.7270067421447653, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.965999603271484, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8593573570251465, + "num_tokens": 218077549.0, + "step": 5715 + }, + { + "epoch": 0.7271339524233558, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.801904678344727, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8591087460517883, + "num_tokens": 218116251.0, + "step": 5716 + }, + { + "epoch": 0.7272611627019463, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.60144805908203, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.833808183670044, + "num_tokens": 218155798.0, + "step": 5717 + }, + { + "epoch": 0.7273883729805368, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.79608154296875, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8652024269104004, + "num_tokens": 218191277.0, + "step": 5718 + }, + { + "epoch": 0.7275155832591274, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.581144332885742, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8654374480247498, + "num_tokens": 218233165.0, + "step": 5719 + }, + { + "epoch": 0.7276427935377179, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.731801986694336, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8394927978515625, + "num_tokens": 218271263.0, + "step": 5720 + }, + { + "epoch": 0.7277700038163084, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.84206771850586, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8603696823120117, + "num_tokens": 218309081.0, + "step": 5721 + }, + { + "epoch": 0.7278972140948988, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.72444725036621, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8549745082855225, + "num_tokens": 218348314.0, + "step": 5722 + }, + { + "epoch": 0.7280244243734894, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.785173416137695, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8623062372207642, + "num_tokens": 218387556.0, + "step": 5723 + }, + { + "epoch": 0.7281516346520799, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.79534912109375, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8513888120651245, + "num_tokens": 218423130.0, + "step": 5724 + }, + { + "epoch": 0.7282788449306704, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.738306045532227, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8411463499069214, + "num_tokens": 218464079.0, + "step": 5725 + }, + { + "epoch": 0.7284060552092609, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.93863868713379, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.866811990737915, + "num_tokens": 218501307.0, + "step": 5726 + }, + { + "epoch": 0.7285332654878515, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.952756881713867, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8673772215843201, + "num_tokens": 218537710.0, + "step": 5727 + }, + { + "epoch": 0.7286604757664419, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.635807037353516, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8550601005554199, + "num_tokens": 218576026.0, + "step": 5728 + }, + { + "epoch": 0.7287876860450324, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.950050354003906, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8566758632659912, + "num_tokens": 218612491.0, + "step": 5729 + }, + { + "epoch": 0.7289148963236229, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81946563720703, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8678928017616272, + "num_tokens": 218642541.0, + "step": 5730 + }, + { + "epoch": 0.7290421066022135, + "ewc_loss": 0.025390625, + "ewc_loss_parallel": 2.5391578674316406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90555763244629, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8454965353012085, + "num_tokens": 218678514.0, + "step": 5731 + }, + { + "epoch": 0.729169316880804, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.750473022460938, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8636298179626465, + "num_tokens": 218715415.0, + "step": 5732 + }, + { + "epoch": 0.7292965271593945, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.85146713256836, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8670016527175903, + "num_tokens": 218754422.0, + "step": 5733 + }, + { + "epoch": 0.7294237374379849, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.82840919494629, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8518233299255371, + "num_tokens": 218787141.0, + "step": 5734 + }, + { + "epoch": 0.7295509477165755, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.717079162597656, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8737041354179382, + "num_tokens": 218824764.0, + "step": 5735 + }, + { + "epoch": 0.729678157995166, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.661422729492188, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8572168946266174, + "num_tokens": 218866142.0, + "step": 5736 + }, + { + "epoch": 0.7298053682737565, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.796552658081055, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8552270531654358, + "num_tokens": 218909208.0, + "step": 5737 + }, + { + "epoch": 0.7299325785523471, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.751026153564453, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.867027759552002, + "num_tokens": 218942469.0, + "step": 5738 + }, + { + "epoch": 0.7300597888309376, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.84265899658203, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8530248999595642, + "num_tokens": 218982934.0, + "step": 5739 + }, + { + "epoch": 0.730186999109528, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.946184158325195, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8566335439682007, + "num_tokens": 219027055.0, + "step": 5740 + }, + { + "epoch": 0.7303142093881185, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.774763107299805, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8602538704872131, + "num_tokens": 219069639.0, + "step": 5741 + }, + { + "epoch": 0.7304414196667091, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.73870277404785, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8530651330947876, + "num_tokens": 219111774.0, + "step": 5742 + }, + { + "epoch": 0.7305686299452996, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.754131317138672, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8459293246269226, + "num_tokens": 219156962.0, + "step": 5743 + }, + { + "epoch": 0.7306958402238901, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.841381072998047, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8684971332550049, + "num_tokens": 219193352.0, + "step": 5744 + }, + { + "epoch": 0.7308230505024806, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.16965675354004, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8451303243637085, + "num_tokens": 219230041.0, + "step": 5745 + }, + { + "epoch": 0.7309502607810711, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.72676658630371, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8370104432106018, + "num_tokens": 219262748.0, + "step": 5746 + }, + { + "epoch": 0.7310774710596616, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91830062866211, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8706837892532349, + "num_tokens": 219296556.0, + "step": 5747 + }, + { + "epoch": 0.7312046813382521, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77901840209961, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8467689156532288, + "num_tokens": 219330801.0, + "step": 5748 + }, + { + "epoch": 0.7313318916168426, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.70224952697754, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8593894243240356, + "num_tokens": 219367470.0, + "step": 5749 + }, + { + "epoch": 0.7314591018954332, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.871103286743164, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8457421660423279, + "num_tokens": 219405291.0, + "step": 5750 + }, + { + "epoch": 0.7315863121740237, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.62405014038086, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8609312772750854, + "num_tokens": 219443365.0, + "step": 5751 + }, + { + "epoch": 0.7317135224526141, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.791343688964844, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8604271411895752, + "num_tokens": 219477524.0, + "step": 5752 + }, + { + "epoch": 0.7318407327312046, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.712919235229492, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8503012657165527, + "num_tokens": 219514576.0, + "step": 5753 + }, + { + "epoch": 0.7319679430097952, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.762868881225586, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8590582013130188, + "num_tokens": 219558253.0, + "step": 5754 + }, + { + "epoch": 0.7320951532883857, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.92549705505371, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8546507358551025, + "num_tokens": 219601156.0, + "step": 5755 + }, + { + "epoch": 0.7322223635669762, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.71043586730957, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8485094308853149, + "num_tokens": 219638241.0, + "step": 5756 + }, + { + "epoch": 0.7323495738455668, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.688962936401367, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8597714304924011, + "num_tokens": 219678168.0, + "step": 5757 + }, + { + "epoch": 0.7324767841241572, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.80110740661621, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8574010729789734, + "num_tokens": 219718115.0, + "step": 5758 + }, + { + "epoch": 0.7326039944027477, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.693618774414062, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8581593036651611, + "num_tokens": 219754221.0, + "step": 5759 + }, + { + "epoch": 0.7327312046813382, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.8487606048584, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8650240898132324, + "num_tokens": 219786880.0, + "step": 5760 + }, + { + "epoch": 0.7328584149599288, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77048110961914, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8629415035247803, + "num_tokens": 219830585.0, + "step": 5761 + }, + { + "epoch": 0.7329856252385193, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.804668426513672, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.862210750579834, + "num_tokens": 219867249.0, + "step": 5762 + }, + { + "epoch": 0.7331128355171098, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77850914001465, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8507000207901001, + "num_tokens": 219906046.0, + "step": 5763 + }, + { + "epoch": 0.7332400457957003, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90469741821289, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8501752614974976, + "num_tokens": 219947466.0, + "step": 5764 + }, + { + "epoch": 0.7333672560742908, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77450180053711, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8606772422790527, + "num_tokens": 219985698.0, + "step": 5765 + }, + { + "epoch": 0.7334944663528813, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.985858917236328, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8594799041748047, + "num_tokens": 220026158.0, + "step": 5766 + }, + { + "epoch": 0.7336216766314718, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.63484764099121, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8572111129760742, + "num_tokens": 220071849.0, + "step": 5767 + }, + { + "epoch": 0.7337488869100623, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.887691497802734, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8655039072036743, + "num_tokens": 220110255.0, + "step": 5768 + }, + { + "epoch": 0.7338760971886529, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.74901008605957, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8439303636550903, + "num_tokens": 220144962.0, + "step": 5769 + }, + { + "epoch": 0.7340033074672434, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.863677978515625, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8494948148727417, + "num_tokens": 220182118.0, + "step": 5770 + }, + { + "epoch": 0.7341305177458338, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.850345611572266, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8496488332748413, + "num_tokens": 220218954.0, + "step": 5771 + }, + { + "epoch": 0.7342577280244243, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7659969329834, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8579467535018921, + "num_tokens": 220259116.0, + "step": 5772 + }, + { + "epoch": 0.7343849383030149, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.852270126342773, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8552781939506531, + "num_tokens": 220301742.0, + "step": 5773 + }, + { + "epoch": 0.7345121485816054, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.867305755615234, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8633328676223755, + "num_tokens": 220342572.0, + "step": 5774 + }, + { + "epoch": 0.7346393588601959, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90058708190918, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8638684749603271, + "num_tokens": 220376373.0, + "step": 5775 + }, + { + "epoch": 0.7347665691387865, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.786827087402344, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8747507333755493, + "num_tokens": 220413290.0, + "step": 5776 + }, + { + "epoch": 0.7348937794173769, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.878307342529297, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8552120923995972, + "num_tokens": 220453219.0, + "step": 5777 + }, + { + "epoch": 0.7350209896959674, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.896724700927734, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8591833710670471, + "num_tokens": 220488369.0, + "step": 5778 + }, + { + "epoch": 0.7351481999745579, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.720176696777344, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8624951839447021, + "num_tokens": 220527798.0, + "step": 5779 + }, + { + "epoch": 0.7352754102531485, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.832406997680664, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8771847486495972, + "num_tokens": 220565398.0, + "step": 5780 + }, + { + "epoch": 0.735402620531739, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.751802444458008, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8686390519142151, + "num_tokens": 220603903.0, + "step": 5781 + }, + { + "epoch": 0.7355298308103295, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.666879653930664, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8416343927383423, + "num_tokens": 220644168.0, + "step": 5782 + }, + { + "epoch": 0.7356570410889199, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.74669647216797, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8706294298171997, + "num_tokens": 220679709.0, + "step": 5783 + }, + { + "epoch": 0.7357842513675105, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.828250885009766, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8533631563186646, + "num_tokens": 220721184.0, + "step": 5784 + }, + { + "epoch": 0.735911461646101, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.760868072509766, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8631210923194885, + "num_tokens": 220755027.0, + "step": 5785 + }, + { + "epoch": 0.7360386719246915, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.875370025634766, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8352177143096924, + "num_tokens": 220793597.0, + "step": 5786 + }, + { + "epoch": 0.736165882203282, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77149200439453, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.858011782169342, + "num_tokens": 220833549.0, + "step": 5787 + }, + { + "epoch": 0.7362930924818726, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.80802345275879, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8648486137390137, + "num_tokens": 220865551.0, + "step": 5788 + }, + { + "epoch": 0.736420302760463, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.88180160522461, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8719204664230347, + "num_tokens": 220911626.0, + "step": 5789 + }, + { + "epoch": 0.7365475130390535, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81678581237793, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.860200047492981, + "num_tokens": 220949839.0, + "step": 5790 + }, + { + "epoch": 0.736674723317644, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.781719207763672, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8424844741821289, + "num_tokens": 220984069.0, + "step": 5791 + }, + { + "epoch": 0.7368019335962346, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.739782333374023, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8796541094779968, + "num_tokens": 221020999.0, + "step": 5792 + }, + { + "epoch": 0.7369291438748251, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81493377685547, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8500749468803406, + "num_tokens": 221051082.0, + "step": 5793 + }, + { + "epoch": 0.7370563541534156, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.768064498901367, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8423975110054016, + "num_tokens": 221095444.0, + "step": 5794 + }, + { + "epoch": 0.737183564432006, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.873672485351562, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8611021637916565, + "num_tokens": 221136433.0, + "step": 5795 + }, + { + "epoch": 0.7373107747105966, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.776514053344727, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8511613011360168, + "num_tokens": 221171278.0, + "step": 5796 + }, + { + "epoch": 0.7374379849891871, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95705223083496, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.857313871383667, + "num_tokens": 221209380.0, + "step": 5797 + }, + { + "epoch": 0.7375651952677776, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.736083984375, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8576059937477112, + "num_tokens": 221253076.0, + "step": 5798 + }, + { + "epoch": 0.7376924055463682, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.749296188354492, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8545657992362976, + "num_tokens": 221290348.0, + "step": 5799 + }, + { + "epoch": 0.7378196158249587, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.758113861083984, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8645769357681274, + "num_tokens": 221329635.0, + "step": 5800 + }, + { + "epoch": 0.7379468261035491, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81904411315918, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8652216196060181, + "num_tokens": 221368521.0, + "step": 5801 + }, + { + "epoch": 0.7380740363821396, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.944555282592773, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8710727095603943, + "num_tokens": 221407090.0, + "step": 5802 + }, + { + "epoch": 0.7382012466607302, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.66781997680664, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8687323927879333, + "num_tokens": 221450340.0, + "step": 5803 + }, + { + "epoch": 0.7383284569393207, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.8428897857666, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8589760065078735, + "num_tokens": 221491996.0, + "step": 5804 + }, + { + "epoch": 0.7384556672179112, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.862211227416992, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8671125173568726, + "num_tokens": 221531466.0, + "step": 5805 + }, + { + "epoch": 0.7385828774965018, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.849443435668945, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8391766548156738, + "num_tokens": 221567003.0, + "step": 5806 + }, + { + "epoch": 0.7387100877750922, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.843090057373047, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8716776371002197, + "num_tokens": 221607396.0, + "step": 5807 + }, + { + "epoch": 0.7388372980536827, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.883413314819336, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8509660959243774, + "num_tokens": 221644776.0, + "step": 5808 + }, + { + "epoch": 0.7389645083322732, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.758520126342773, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8723734617233276, + "num_tokens": 221685643.0, + "step": 5809 + }, + { + "epoch": 0.7390917186108638, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.822437286376953, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8528086543083191, + "num_tokens": 221724673.0, + "step": 5810 + }, + { + "epoch": 0.7392189288894543, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.771419525146484, + "learning_rate": 1e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8343271017074585, + "num_tokens": 221766287.0, + "step": 5811 + }, + { + "epoch": 0.7393461391680448, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.71863555908203, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8584248423576355, + "num_tokens": 221805357.0, + "step": 5812 + }, + { + "epoch": 0.7394733494466353, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.858413696289062, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8562465906143188, + "num_tokens": 221838513.0, + "step": 5813 + }, + { + "epoch": 0.7396005597252258, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.67293357849121, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8475742936134338, + "num_tokens": 221882808.0, + "step": 5814 + }, + { + "epoch": 0.7397277700038163, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.97214126586914, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8712939620018005, + "num_tokens": 221918505.0, + "step": 5815 + }, + { + "epoch": 0.7398549802824068, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.602813720703125, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8594906330108643, + "num_tokens": 221955983.0, + "step": 5816 + }, + { + "epoch": 0.7399821905609973, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.892045974731445, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8584205508232117, + "num_tokens": 221997363.0, + "step": 5817 + }, + { + "epoch": 0.7401094008395879, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.831308364868164, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8453004360198975, + "num_tokens": 222039302.0, + "step": 5818 + }, + { + "epoch": 0.7402366111181784, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.773561477661133, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8583188056945801, + "num_tokens": 222075779.0, + "step": 5819 + }, + { + "epoch": 0.7403638213967688, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.827800750732422, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8769021034240723, + "num_tokens": 222115120.0, + "step": 5820 + }, + { + "epoch": 0.7404910316753593, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.832759857177734, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8509999513626099, + "num_tokens": 222156570.0, + "step": 5821 + }, + { + "epoch": 0.7406182419539499, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.792375564575195, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8587590456008911, + "num_tokens": 222189487.0, + "step": 5822 + }, + { + "epoch": 0.7407454522325404, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.699703216552734, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8576903939247131, + "num_tokens": 222223163.0, + "step": 5823 + }, + { + "epoch": 0.7408726625111309, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.73221206665039, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8712891936302185, + "num_tokens": 222259042.0, + "step": 5824 + }, + { + "epoch": 0.7409998727897215, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.838054656982422, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8398726582527161, + "num_tokens": 222300497.0, + "step": 5825 + }, + { + "epoch": 0.7411270830683119, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.803279876708984, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8545247316360474, + "num_tokens": 222333630.0, + "step": 5826 + }, + { + "epoch": 0.7412542933469024, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81816864013672, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8536887168884277, + "num_tokens": 222371715.0, + "step": 5827 + }, + { + "epoch": 0.7413815036254929, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.964008331298828, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8598896861076355, + "num_tokens": 222412044.0, + "step": 5828 + }, + { + "epoch": 0.7415087139040835, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.663127899169922, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8609216213226318, + "num_tokens": 222452668.0, + "step": 5829 + }, + { + "epoch": 0.741635924182674, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.861907958984375, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8610767722129822, + "num_tokens": 222485631.0, + "step": 5830 + }, + { + "epoch": 0.7417631344612645, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.59282112121582, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8733296394348145, + "num_tokens": 222522967.0, + "step": 5831 + }, + { + "epoch": 0.7418903447398549, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81248664855957, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8491142988204956, + "num_tokens": 222563258.0, + "step": 5832 + }, + { + "epoch": 0.7420175550184455, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.82445526123047, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8530303239822388, + "num_tokens": 222599248.0, + "step": 5833 + }, + { + "epoch": 0.742144765297036, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.805360794067383, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8658416271209717, + "num_tokens": 222642575.0, + "step": 5834 + }, + { + "epoch": 0.7422719755756265, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.8265438079834, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.834994375705719, + "num_tokens": 222687330.0, + "step": 5835 + }, + { + "epoch": 0.742399185854217, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.83700942993164, + "learning_rate": 1e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8326683640480042, + "num_tokens": 222720020.0, + "step": 5836 + }, + { + "epoch": 0.7425263961328076, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7198543548584, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8724595308303833, + "num_tokens": 222758634.0, + "step": 5837 + }, + { + "epoch": 0.742653606411398, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.829326629638672, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8660843968391418, + "num_tokens": 222794823.0, + "step": 5838 + }, + { + "epoch": 0.7427808166899885, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.8605899810791, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8588659763336182, + "num_tokens": 222833836.0, + "step": 5839 + }, + { + "epoch": 0.742908026968579, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.730083465576172, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8598140478134155, + "num_tokens": 222875712.0, + "step": 5840 + }, + { + "epoch": 0.7430352372471696, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.79835319519043, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8478257656097412, + "num_tokens": 222915494.0, + "step": 5841 + }, + { + "epoch": 0.7431624475257601, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.84943389892578, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8755784034729004, + "num_tokens": 222951675.0, + "step": 5842 + }, + { + "epoch": 0.7432896578043506, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.712202072143555, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8536715507507324, + "num_tokens": 222993245.0, + "step": 5843 + }, + { + "epoch": 0.743416868082941, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.85616683959961, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8656094670295715, + "num_tokens": 223029876.0, + "step": 5844 + }, + { + "epoch": 0.7435440783615316, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.622777938842773, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8638389110565186, + "num_tokens": 223072550.0, + "step": 5845 + }, + { + "epoch": 0.7436712886401221, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.78431510925293, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8577702045440674, + "num_tokens": 223112892.0, + "step": 5846 + }, + { + "epoch": 0.7437984989187126, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.866153717041016, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8641114830970764, + "num_tokens": 223157093.0, + "step": 5847 + }, + { + "epoch": 0.7439257091973032, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.78076934814453, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8499798774719238, + "num_tokens": 223190756.0, + "step": 5848 + }, + { + "epoch": 0.7440529194758937, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7658634185791, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8494071960449219, + "num_tokens": 223226013.0, + "step": 5849 + }, + { + "epoch": 0.7441801297544841, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.781705856323242, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.871042788028717, + "num_tokens": 223262858.0, + "step": 5850 + }, + { + "epoch": 0.7443073400330746, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.828229904174805, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8626872301101685, + "num_tokens": 223310528.0, + "step": 5851 + }, + { + "epoch": 0.7444345503116652, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.846927642822266, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8609433174133301, + "num_tokens": 223345021.0, + "step": 5852 + }, + { + "epoch": 0.7445617605902557, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81793212890625, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.868968665599823, + "num_tokens": 223382055.0, + "step": 5853 + }, + { + "epoch": 0.7446889708688462, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.755897521972656, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8487458825111389, + "num_tokens": 223418441.0, + "step": 5854 + }, + { + "epoch": 0.7448161811474368, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.775489807128906, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8576722145080566, + "num_tokens": 223458150.0, + "step": 5855 + }, + { + "epoch": 0.7449433914260272, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.907201766967773, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8557633757591248, + "num_tokens": 223496916.0, + "step": 5856 + }, + { + "epoch": 0.7450706017046177, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.750455856323242, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8650650978088379, + "num_tokens": 223534808.0, + "step": 5857 + }, + { + "epoch": 0.7451978119832082, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.73218536376953, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8682005405426025, + "num_tokens": 223571255.0, + "step": 5858 + }, + { + "epoch": 0.7453250222617988, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.777450561523438, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8627659678459167, + "num_tokens": 223616608.0, + "step": 5859 + }, + { + "epoch": 0.7454522325403893, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.622896194458008, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8595105409622192, + "num_tokens": 223656188.0, + "step": 5860 + }, + { + "epoch": 0.7455794428189798, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.89360237121582, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8547030687332153, + "num_tokens": 223693787.0, + "step": 5861 + }, + { + "epoch": 0.7457066530975703, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.651521682739258, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8705368638038635, + "num_tokens": 223729350.0, + "step": 5862 + }, + { + "epoch": 0.7458338633761608, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.782676696777344, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.847353458404541, + "num_tokens": 223770846.0, + "step": 5863 + }, + { + "epoch": 0.7459610736547513, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.824308395385742, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8587530851364136, + "num_tokens": 223814194.0, + "step": 5864 + }, + { + "epoch": 0.7460882839333418, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.860586166381836, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.851985514163971, + "num_tokens": 223849141.0, + "step": 5865 + }, + { + "epoch": 0.7462154942119323, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.79564094543457, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8808447122573853, + "num_tokens": 223888654.0, + "step": 5866 + }, + { + "epoch": 0.7463427044905229, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.768352508544922, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8622283339500427, + "num_tokens": 223924509.0, + "step": 5867 + }, + { + "epoch": 0.7464699147691134, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.96934700012207, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8442597389221191, + "num_tokens": 223966191.0, + "step": 5868 + }, + { + "epoch": 0.7465971250477038, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.698070526123047, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8613653182983398, + "num_tokens": 224001415.0, + "step": 5869 + }, + { + "epoch": 0.7467243353262943, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.99905776977539, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8655742406845093, + "num_tokens": 224042820.0, + "step": 5870 + }, + { + "epoch": 0.7468515456048849, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.6132869720459, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8650059103965759, + "num_tokens": 224078411.0, + "step": 5871 + }, + { + "epoch": 0.7469787558834754, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91102409362793, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8655298352241516, + "num_tokens": 224117136.0, + "step": 5872 + }, + { + "epoch": 0.7471059661620659, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.916339874267578, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8522910475730896, + "num_tokens": 224155879.0, + "step": 5873 + }, + { + "epoch": 0.7472331764406565, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.74352264404297, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8700993061065674, + "num_tokens": 224195186.0, + "step": 5874 + }, + { + "epoch": 0.7473603867192469, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91390609741211, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8554857969284058, + "num_tokens": 224235722.0, + "step": 5875 + }, + { + "epoch": 0.7474875969978374, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.996261596679688, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8663965463638306, + "num_tokens": 224268461.0, + "step": 5876 + }, + { + "epoch": 0.7476148072764279, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.860469818115234, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8722500801086426, + "num_tokens": 224302921.0, + "step": 5877 + }, + { + "epoch": 0.7477420175550185, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77776527404785, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.860843300819397, + "num_tokens": 224344081.0, + "step": 5878 + }, + { + "epoch": 0.747869227833609, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.89954376220703, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8464658856391907, + "num_tokens": 224384464.0, + "step": 5879 + }, + { + "epoch": 0.7479964381121995, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.841426849365234, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8682011365890503, + "num_tokens": 224425542.0, + "step": 5880 + }, + { + "epoch": 0.7481236483907899, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.786264419555664, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8762255907058716, + "num_tokens": 224465342.0, + "step": 5881 + }, + { + "epoch": 0.7482508586693805, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.717662811279297, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8643826842308044, + "num_tokens": 224504787.0, + "step": 5882 + }, + { + "epoch": 0.748378068947971, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.614755630493164, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8599458336830139, + "num_tokens": 224548534.0, + "step": 5883 + }, + { + "epoch": 0.7485052792265615, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.912078857421875, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.85173499584198, + "num_tokens": 224590251.0, + "step": 5884 + }, + { + "epoch": 0.748632489505152, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.916362762451172, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8698038458824158, + "num_tokens": 224633745.0, + "step": 5885 + }, + { + "epoch": 0.7487596997837426, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.799619674682617, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8439440727233887, + "num_tokens": 224671319.0, + "step": 5886 + }, + { + "epoch": 0.748886910062333, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.604278564453125, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8593409061431885, + "num_tokens": 224712670.0, + "step": 5887 + }, + { + "epoch": 0.7490141203409235, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.773757934570312, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8574210405349731, + "num_tokens": 224747840.0, + "step": 5888 + }, + { + "epoch": 0.749141330619514, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.827838897705078, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8696594834327698, + "num_tokens": 224783274.0, + "step": 5889 + }, + { + "epoch": 0.7492685408981046, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.717548370361328, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8556386232376099, + "num_tokens": 224820456.0, + "step": 5890 + }, + { + "epoch": 0.7493957511766951, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.774444580078125, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.863755464553833, + "num_tokens": 224858569.0, + "step": 5891 + }, + { + "epoch": 0.7495229614552856, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.69243812561035, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8581041097640991, + "num_tokens": 224901033.0, + "step": 5892 + }, + { + "epoch": 0.749650171733876, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.721233367919922, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8752763867378235, + "num_tokens": 224934345.0, + "step": 5893 + }, + { + "epoch": 0.7497773820124666, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95439338684082, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8524115681648254, + "num_tokens": 224972479.0, + "step": 5894 + }, + { + "epoch": 0.7499045922910571, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.769596099853516, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8758008480072021, + "num_tokens": 225014213.0, + "step": 5895 + }, + { + "epoch": 0.7500318025696476, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.925003051757812, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8668606281280518, + "num_tokens": 225049929.0, + "step": 5896 + }, + { + "epoch": 0.7501590128482382, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.86912727355957, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8444294333457947, + "num_tokens": 225086409.0, + "step": 5897 + }, + { + "epoch": 0.7502862231268287, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.822093963623047, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8419100046157837, + "num_tokens": 225124443.0, + "step": 5898 + }, + { + "epoch": 0.7504134334054191, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.760934829711914, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8486426472663879, + "num_tokens": 225166665.0, + "step": 5899 + }, + { + "epoch": 0.7505406436840096, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24357795715332, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8350999355316162, + "num_tokens": 225203841.0, + "step": 5900 + }, + { + "epoch": 0.7506678539626002, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.74713706970215, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8433576822280884, + "num_tokens": 225234844.0, + "step": 5901 + }, + { + "epoch": 0.7507950642411907, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.081661224365234, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8416483402252197, + "num_tokens": 225269828.0, + "step": 5902 + }, + { + "epoch": 0.7509222745197812, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90031623840332, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8643550872802734, + "num_tokens": 225312361.0, + "step": 5903 + }, + { + "epoch": 0.7510494847983717, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.46976089477539, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8487135171890259, + "num_tokens": 225353708.0, + "step": 5904 + }, + { + "epoch": 0.7511766950769622, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.32440948486328, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8786876201629639, + "num_tokens": 225399730.0, + "step": 5905 + }, + { + "epoch": 0.7513039053555527, + "ewc_loss": 0.0250244140625, + "ewc_loss_parallel": 2.5033950805664062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.64433479309082, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8535385131835938, + "num_tokens": 225441836.0, + "step": 5906 + }, + { + "epoch": 0.7514311156341432, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62436866760254, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8665592670440674, + "num_tokens": 225471004.0, + "step": 5907 + }, + { + "epoch": 0.7515583259127337, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.888784408569336, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8416113257408142, + "num_tokens": 225505789.0, + "step": 5908 + }, + { + "epoch": 0.7516855361913243, + "ewc_loss": 0.0255126953125, + "ewc_loss_parallel": 2.5510787963867188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.659866333007812, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8616242408752441, + "num_tokens": 225541138.0, + "step": 5909 + }, + { + "epoch": 0.7518127464699148, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.336177825927734, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8626384735107422, + "num_tokens": 225581695.0, + "step": 5910 + }, + { + "epoch": 0.7519399567485053, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.199872970581055, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8648334741592407, + "num_tokens": 225619532.0, + "step": 5911 + }, + { + "epoch": 0.7520671670270958, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.660171508789062, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8509125709533691, + "num_tokens": 225657403.0, + "step": 5912 + }, + { + "epoch": 0.7521943773056863, + "ewc_loss": 0.025634765625, + "ewc_loss_parallel": 2.562999725341797e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.712757110595703, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8546264171600342, + "num_tokens": 225693402.0, + "step": 5913 + }, + { + "epoch": 0.7523215875842768, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.208148956298828, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8576618432998657, + "num_tokens": 225727208.0, + "step": 5914 + }, + { + "epoch": 0.7524487978628673, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.888776779174805, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8640004396438599, + "num_tokens": 225762117.0, + "step": 5915 + }, + { + "epoch": 0.7525760081414579, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7503719329834, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8434149622917175, + "num_tokens": 225794737.0, + "step": 5916 + }, + { + "epoch": 0.7527032184200484, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.847625732421875, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8583009243011475, + "num_tokens": 225829074.0, + "step": 5917 + }, + { + "epoch": 0.7528304286986388, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.88146209716797, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8725073337554932, + "num_tokens": 225871344.0, + "step": 5918 + }, + { + "epoch": 0.7529576389772293, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.967918395996094, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8541659116744995, + "num_tokens": 225911321.0, + "step": 5919 + }, + { + "epoch": 0.7530848492558199, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.587778091430664, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.867635190486908, + "num_tokens": 225944613.0, + "step": 5920 + }, + { + "epoch": 0.7532120595344104, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.011091232299805, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8646047115325928, + "num_tokens": 225989501.0, + "step": 5921 + }, + { + "epoch": 0.7533392698130009, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.817609786987305, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8606719970703125, + "num_tokens": 226024984.0, + "step": 5922 + }, + { + "epoch": 0.7534664800915915, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.850797653198242, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8719950914382935, + "num_tokens": 226063255.0, + "step": 5923 + }, + { + "epoch": 0.7535936903701819, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.810508728027344, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8472317457199097, + "num_tokens": 226105538.0, + "step": 5924 + }, + { + "epoch": 0.7537209006487724, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.821069717407227, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8625744581222534, + "num_tokens": 226137449.0, + "step": 5925 + }, + { + "epoch": 0.7538481109273629, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.825193405151367, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8531394004821777, + "num_tokens": 226171078.0, + "step": 5926 + }, + { + "epoch": 0.7539753212059535, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.670564651489258, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8611009120941162, + "num_tokens": 226208060.0, + "step": 5927 + }, + { + "epoch": 0.754102531484544, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.862070083618164, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8693563342094421, + "num_tokens": 226245330.0, + "step": 5928 + }, + { + "epoch": 0.7542297417631345, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.817886352539062, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8800505995750427, + "num_tokens": 226281974.0, + "step": 5929 + }, + { + "epoch": 0.7543569520417249, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.693788528442383, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8488762378692627, + "num_tokens": 226323302.0, + "step": 5930 + }, + { + "epoch": 0.7544841623203155, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.82107925415039, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.846662700176239, + "num_tokens": 226361346.0, + "step": 5931 + }, + { + "epoch": 0.754611372598906, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.840017318725586, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8554390668869019, + "num_tokens": 226397744.0, + "step": 5932 + }, + { + "epoch": 0.7547385828774965, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.772071838378906, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8671407699584961, + "num_tokens": 226436745.0, + "step": 5933 + }, + { + "epoch": 0.754865793156087, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.973644256591797, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8678740859031677, + "num_tokens": 226470302.0, + "step": 5934 + }, + { + "epoch": 0.7549930034346776, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.825563430786133, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8619318008422852, + "num_tokens": 226500090.0, + "step": 5935 + }, + { + "epoch": 0.755120213713268, + "ewc_loss": 0.0257568359375, + "ewc_loss_parallel": 2.574920654296875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.652469635009766, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8613057136535645, + "num_tokens": 226533359.0, + "step": 5936 + }, + { + "epoch": 0.7552474239918585, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.240447998046875, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8494470119476318, + "num_tokens": 226567060.0, + "step": 5937 + }, + { + "epoch": 0.755374634270449, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.650653839111328, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8571509122848511, + "num_tokens": 226604270.0, + "step": 5938 + }, + { + "epoch": 0.7555018445490396, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.771160125732422, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8579801321029663, + "num_tokens": 226641310.0, + "step": 5939 + }, + { + "epoch": 0.7556290548276301, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.867216110229492, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8481439352035522, + "num_tokens": 226678305.0, + "step": 5940 + }, + { + "epoch": 0.7557562651062206, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.967092514038086, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8657342195510864, + "num_tokens": 226715628.0, + "step": 5941 + }, + { + "epoch": 0.755883475384811, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.781898498535156, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8709772229194641, + "num_tokens": 226755346.0, + "step": 5942 + }, + { + "epoch": 0.7560106856634016, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.89968490600586, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8489253520965576, + "num_tokens": 226792243.0, + "step": 5943 + }, + { + "epoch": 0.7561378959419921, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.798309326171875, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8736761808395386, + "num_tokens": 226829033.0, + "step": 5944 + }, + { + "epoch": 0.7562651062205826, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.806020736694336, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.873822033405304, + "num_tokens": 226870645.0, + "step": 5945 + }, + { + "epoch": 0.7563923164991732, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.918964385986328, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8635441064834595, + "num_tokens": 226902395.0, + "step": 5946 + }, + { + "epoch": 0.7565195267777637, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.80671501159668, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8662260174751282, + "num_tokens": 226942620.0, + "step": 5947 + }, + { + "epoch": 0.7566467370563541, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.89434242248535, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8680517077445984, + "num_tokens": 226976472.0, + "step": 5948 + }, + { + "epoch": 0.7567739473349446, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77634620666504, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8490056991577148, + "num_tokens": 227013626.0, + "step": 5949 + }, + { + "epoch": 0.7569011576135352, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.88412857055664, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8470627069473267, + "num_tokens": 227049336.0, + "step": 5950 + }, + { + "epoch": 0.7570283678921257, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.830894470214844, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8831948041915894, + "num_tokens": 227087371.0, + "step": 5951 + }, + { + "epoch": 0.7571555781707162, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.954959869384766, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8620630502700806, + "num_tokens": 227129188.0, + "step": 5952 + }, + { + "epoch": 0.7572827884493067, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.832904815673828, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8520344495773315, + "num_tokens": 227161807.0, + "step": 5953 + }, + { + "epoch": 0.7574099987278972, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.854265213012695, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8506690263748169, + "num_tokens": 227198431.0, + "step": 5954 + }, + { + "epoch": 0.7575372090064877, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.883710861206055, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8797363638877869, + "num_tokens": 227236046.0, + "step": 5955 + }, + { + "epoch": 0.7576644192850782, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.768901824951172, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8768004775047302, + "num_tokens": 227272925.0, + "step": 5956 + }, + { + "epoch": 0.7577916295636687, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.963985443115234, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8639953136444092, + "num_tokens": 227312311.0, + "step": 5957 + }, + { + "epoch": 0.7579188398422593, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.769638061523438, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8799738883972168, + "num_tokens": 227349632.0, + "step": 5958 + }, + { + "epoch": 0.7580460501208498, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.980318069458008, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8588095307350159, + "num_tokens": 227392982.0, + "step": 5959 + }, + { + "epoch": 0.7581732603994403, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.859291076660156, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8574841618537903, + "num_tokens": 227431878.0, + "step": 5960 + }, + { + "epoch": 0.7583004706780307, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.96712875366211, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8503028154373169, + "num_tokens": 227470401.0, + "step": 5961 + }, + { + "epoch": 0.7584276809566213, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.87676239013672, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8649346232414246, + "num_tokens": 227507189.0, + "step": 5962 + }, + { + "epoch": 0.7585548912352118, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.890743255615234, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8639575242996216, + "num_tokens": 227548070.0, + "step": 5963 + }, + { + "epoch": 0.7586821015138023, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.86811065673828, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8651565909385681, + "num_tokens": 227584360.0, + "step": 5964 + }, + { + "epoch": 0.7588093117923929, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.813154220581055, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8665019869804382, + "num_tokens": 227618961.0, + "step": 5965 + }, + { + "epoch": 0.7589365220709834, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.997982025146484, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8612532615661621, + "num_tokens": 227658304.0, + "step": 5966 + }, + { + "epoch": 0.7590637323495738, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.67268180847168, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8580784797668457, + "num_tokens": 227701526.0, + "step": 5967 + }, + { + "epoch": 0.7591909426281643, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.989280700683594, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8652418851852417, + "num_tokens": 227744930.0, + "step": 5968 + }, + { + "epoch": 0.7593181529067549, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.891746520996094, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8719797134399414, + "num_tokens": 227780607.0, + "step": 5969 + }, + { + "epoch": 0.7594453631853454, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.040361404418945, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8573850393295288, + "num_tokens": 227825742.0, + "step": 5970 + }, + { + "epoch": 0.7595725734639359, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.924428939819336, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8421950340270996, + "num_tokens": 227865278.0, + "step": 5971 + }, + { + "epoch": 0.7596997837425264, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.89557647705078, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8573371171951294, + "num_tokens": 227906912.0, + "step": 5972 + }, + { + "epoch": 0.7598269940211169, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.946487426757812, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8581478595733643, + "num_tokens": 227947067.0, + "step": 5973 + }, + { + "epoch": 0.7599542042997074, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.817461013793945, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8427116870880127, + "num_tokens": 227988563.0, + "step": 5974 + }, + { + "epoch": 0.7600814145782979, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7740535736084, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8540136814117432, + "num_tokens": 228028556.0, + "step": 5975 + }, + { + "epoch": 0.7602086248568884, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.97617530822754, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8745723962783813, + "num_tokens": 228066265.0, + "step": 5976 + }, + { + "epoch": 0.760335835135479, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.76224708557129, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8577290773391724, + "num_tokens": 228103732.0, + "step": 5977 + }, + { + "epoch": 0.7604630454140695, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.895456314086914, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.858320415019989, + "num_tokens": 228149609.0, + "step": 5978 + }, + { + "epoch": 0.7605902556926599, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.02490997314453, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8636730909347534, + "num_tokens": 228188462.0, + "step": 5979 + }, + { + "epoch": 0.7607174659712505, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.820592880249023, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8736433982849121, + "num_tokens": 228229252.0, + "step": 5980 + }, + { + "epoch": 0.760844676249841, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.891477584838867, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8582890629768372, + "num_tokens": 228267397.0, + "step": 5981 + }, + { + "epoch": 0.7609718865284315, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.88705062866211, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.868221640586853, + "num_tokens": 228307561.0, + "step": 5982 + }, + { + "epoch": 0.761099096807022, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.807559967041016, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8625767827033997, + "num_tokens": 228347667.0, + "step": 5983 + }, + { + "epoch": 0.7612263070856126, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.823572158813477, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8587849140167236, + "num_tokens": 228386820.0, + "step": 5984 + }, + { + "epoch": 0.761353517364203, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.029937744140625, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8699530959129333, + "num_tokens": 228427304.0, + "step": 5985 + }, + { + "epoch": 0.7614807276427935, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.984411239624023, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8507232666015625, + "num_tokens": 228470779.0, + "step": 5986 + }, + { + "epoch": 0.761607937921384, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.97810935974121, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8770205974578857, + "num_tokens": 228500454.0, + "step": 5987 + }, + { + "epoch": 0.7617351481999746, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.960323333740234, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8676433563232422, + "num_tokens": 228537374.0, + "step": 5988 + }, + { + "epoch": 0.7618623584785651, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.747802734375, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8612940311431885, + "num_tokens": 228577215.0, + "step": 5989 + }, + { + "epoch": 0.7619895687571556, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06878662109375, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8651086091995239, + "num_tokens": 228611693.0, + "step": 5990 + }, + { + "epoch": 0.762116779035746, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.874940872192383, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8592557907104492, + "num_tokens": 228644231.0, + "step": 5991 + }, + { + "epoch": 0.7622439893143366, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.888906478881836, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8712295293807983, + "num_tokens": 228684776.0, + "step": 5992 + }, + { + "epoch": 0.7623711995929271, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.888973236083984, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8667749762535095, + "num_tokens": 228724214.0, + "step": 5993 + }, + { + "epoch": 0.7624984098715176, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.811962127685547, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8596004843711853, + "num_tokens": 228765098.0, + "step": 5994 + }, + { + "epoch": 0.7626256201501082, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.78785514831543, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8623843193054199, + "num_tokens": 228802423.0, + "step": 5995 + }, + { + "epoch": 0.7627528304286987, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.120769500732422, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8505186438560486, + "num_tokens": 228840218.0, + "step": 5996 + }, + { + "epoch": 0.7628800407072891, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.842504501342773, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8665231466293335, + "num_tokens": 228882810.0, + "step": 5997 + }, + { + "epoch": 0.7630072509858796, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.73900032043457, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8668527603149414, + "num_tokens": 228917664.0, + "step": 5998 + }, + { + "epoch": 0.7631344612644702, + "ewc_loss": 0.02587890625, + "ewc_loss_parallel": 2.586841583251953e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.935232162475586, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8676157593727112, + "num_tokens": 228951017.0, + "step": 5999 + }, + { + "epoch": 0.7632616715430607, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77201271057129, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8621574640274048, + "num_tokens": 228990949.0, + "step": 6000 + }, + { + "epoch": 0.7633888818216512, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.970922470092773, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8659272789955139, + "num_tokens": 229026604.0, + "step": 6001 + }, + { + "epoch": 0.7635160921002417, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.865331649780273, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8549196124076843, + "num_tokens": 229061810.0, + "step": 6002 + }, + { + "epoch": 0.7636433023788322, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.795881271362305, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8382241725921631, + "num_tokens": 229107765.0, + "step": 6003 + }, + { + "epoch": 0.7637705126574227, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.945388793945312, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.848699688911438, + "num_tokens": 229147822.0, + "step": 6004 + }, + { + "epoch": 0.7638977229360132, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.86481285095215, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8711901903152466, + "num_tokens": 229184421.0, + "step": 6005 + }, + { + "epoch": 0.7640249332146037, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.874616622924805, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8609842658042908, + "num_tokens": 229221737.0, + "step": 6006 + }, + { + "epoch": 0.7641521434931943, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.843353271484375, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8552612662315369, + "num_tokens": 229254727.0, + "step": 6007 + }, + { + "epoch": 0.7642793537717848, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.902238845825195, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8716042637825012, + "num_tokens": 229292648.0, + "step": 6008 + }, + { + "epoch": 0.7644065640503753, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.82555389404297, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8654574155807495, + "num_tokens": 229335708.0, + "step": 6009 + }, + { + "epoch": 0.7645337743289657, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.951452255249023, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8689897656440735, + "num_tokens": 229373791.0, + "step": 6010 + }, + { + "epoch": 0.7646609846075563, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.89849090576172, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8714113831520081, + "num_tokens": 229409891.0, + "step": 6011 + }, + { + "epoch": 0.7647881948861468, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.022336959838867, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8636151552200317, + "num_tokens": 229449859.0, + "step": 6012 + }, + { + "epoch": 0.7649154051647373, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.982221603393555, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8678328990936279, + "num_tokens": 229484224.0, + "step": 6013 + }, + { + "epoch": 0.7650426154433279, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.75784683227539, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8572824597358704, + "num_tokens": 229521104.0, + "step": 6014 + }, + { + "epoch": 0.7651698257219184, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.937402725219727, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8458824157714844, + "num_tokens": 229558918.0, + "step": 6015 + }, + { + "epoch": 0.7652970360005088, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.94582748413086, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8478429913520813, + "num_tokens": 229596931.0, + "step": 6016 + }, + { + "epoch": 0.7654242462790993, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.01388168334961, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8461939692497253, + "num_tokens": 229633903.0, + "step": 6017 + }, + { + "epoch": 0.7655514565576899, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.934289932250977, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8579695820808411, + "num_tokens": 229674224.0, + "step": 6018 + }, + { + "epoch": 0.7656786668362804, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.94135856628418, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8587111830711365, + "num_tokens": 229710547.0, + "step": 6019 + }, + { + "epoch": 0.7658058771148709, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.805830001831055, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8765188455581665, + "num_tokens": 229752020.0, + "step": 6020 + }, + { + "epoch": 0.7659330873934614, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.894887924194336, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8585318922996521, + "num_tokens": 229785710.0, + "step": 6021 + }, + { + "epoch": 0.7660602976720519, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.87745475769043, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8682039380073547, + "num_tokens": 229824118.0, + "step": 6022 + }, + { + "epoch": 0.7661875079506424, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.972515106201172, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8731793165206909, + "num_tokens": 229854231.0, + "step": 6023 + }, + { + "epoch": 0.7663147182292329, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.122474670410156, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8509002923965454, + "num_tokens": 229888102.0, + "step": 6024 + }, + { + "epoch": 0.7664419285078234, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.80000114440918, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8656288981437683, + "num_tokens": 229927634.0, + "step": 6025 + }, + { + "epoch": 0.766569138786414, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.946678161621094, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8701512813568115, + "num_tokens": 229966526.0, + "step": 6026 + }, + { + "epoch": 0.7666963490650045, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.794815063476562, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.870092511177063, + "num_tokens": 230002761.0, + "step": 6027 + }, + { + "epoch": 0.7668235593435949, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.985858917236328, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.867033839225769, + "num_tokens": 230041303.0, + "step": 6028 + }, + { + "epoch": 0.7669507696221854, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.983440399169922, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8651696443557739, + "num_tokens": 230081117.0, + "step": 6029 + }, + { + "epoch": 0.767077979900776, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.009502410888672, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8557018637657166, + "num_tokens": 230119411.0, + "step": 6030 + }, + { + "epoch": 0.7672051901793665, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.813615798950195, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8688546419143677, + "num_tokens": 230161111.0, + "step": 6031 + }, + { + "epoch": 0.767332400457957, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.942899703979492, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8567146062850952, + "num_tokens": 230196999.0, + "step": 6032 + }, + { + "epoch": 0.7674596107365476, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95068359375, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8632901906967163, + "num_tokens": 230231495.0, + "step": 6033 + }, + { + "epoch": 0.767586821015138, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.153459548950195, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8544420599937439, + "num_tokens": 230267732.0, + "step": 6034 + }, + { + "epoch": 0.7677140312937285, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81810760498047, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8604040145874023, + "num_tokens": 230297559.0, + "step": 6035 + }, + { + "epoch": 0.767841241572319, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.837921142578125, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8495180606842041, + "num_tokens": 230336396.0, + "step": 6036 + }, + { + "epoch": 0.7679684518509096, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.021648406982422, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8610097169876099, + "num_tokens": 230368152.0, + "step": 6037 + }, + { + "epoch": 0.7680956621295001, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.92858123779297, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8669732809066772, + "num_tokens": 230408081.0, + "step": 6038 + }, + { + "epoch": 0.7682228724080906, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.92005157470703, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8543006181716919, + "num_tokens": 230441691.0, + "step": 6039 + }, + { + "epoch": 0.768350082686681, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95534324645996, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8593357801437378, + "num_tokens": 230480089.0, + "step": 6040 + }, + { + "epoch": 0.7684772929652716, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.1091365814209, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8562706708908081, + "num_tokens": 230518655.0, + "step": 6041 + }, + { + "epoch": 0.7686045032438621, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.84674072265625, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8645907640457153, + "num_tokens": 230555717.0, + "step": 6042 + }, + { + "epoch": 0.7687317135224526, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.237577438354492, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8699454069137573, + "num_tokens": 230596699.0, + "step": 6043 + }, + { + "epoch": 0.7688589238010431, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.891889572143555, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8699014186859131, + "num_tokens": 230633284.0, + "step": 6044 + }, + { + "epoch": 0.7689861340796337, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.918132781982422, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8651580810546875, + "num_tokens": 230673133.0, + "step": 6045 + }, + { + "epoch": 0.7691133443582241, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.004732131958008, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8621478080749512, + "num_tokens": 230719003.0, + "step": 6046 + }, + { + "epoch": 0.7692405546368146, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.79255485534668, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8705766797065735, + "num_tokens": 230754805.0, + "step": 6047 + }, + { + "epoch": 0.7693677649154052, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91214942932129, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8496326804161072, + "num_tokens": 230793249.0, + "step": 6048 + }, + { + "epoch": 0.7694949751939957, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.986732482910156, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.878839373588562, + "num_tokens": 230826025.0, + "step": 6049 + }, + { + "epoch": 0.7696221854725862, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77625274658203, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8734496831893921, + "num_tokens": 230866615.0, + "step": 6050 + }, + { + "epoch": 0.7697493957511767, + "ewc_loss": 0.026123046875, + "ewc_loss_parallel": 2.6106834411621094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.044551849365234, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8456616401672363, + "num_tokens": 230901587.0, + "step": 6051 + }, + { + "epoch": 0.7698766060297672, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.96477508544922, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8674569129943848, + "num_tokens": 230944267.0, + "step": 6052 + }, + { + "epoch": 0.7700038163083577, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.934215545654297, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8519457578659058, + "num_tokens": 230980022.0, + "step": 6053 + }, + { + "epoch": 0.7701310265869482, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.010271072387695, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8684362173080444, + "num_tokens": 231018110.0, + "step": 6054 + }, + { + "epoch": 0.7702582368655387, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.96355438232422, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8660203814506531, + "num_tokens": 231056531.0, + "step": 6055 + }, + { + "epoch": 0.7703854471441293, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.72271728515625, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8667062520980835, + "num_tokens": 231096270.0, + "step": 6056 + }, + { + "epoch": 0.7705126574227198, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18218231201172, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8547207117080688, + "num_tokens": 231136300.0, + "step": 6057 + }, + { + "epoch": 0.7706398677013102, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9255428314209, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8490334749221802, + "num_tokens": 231169005.0, + "step": 6058 + }, + { + "epoch": 0.7707670779799007, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.026391983032227, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8683620691299438, + "num_tokens": 231205274.0, + "step": 6059 + }, + { + "epoch": 0.7708942882584913, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.97621726989746, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8668285608291626, + "num_tokens": 231239022.0, + "step": 6060 + }, + { + "epoch": 0.7710214985370818, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.937049865722656, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8433091640472412, + "num_tokens": 231282952.0, + "step": 6061 + }, + { + "epoch": 0.7711487088156723, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.930818557739258, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8595370054244995, + "num_tokens": 231322980.0, + "step": 6062 + }, + { + "epoch": 0.7712759190942629, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.04190444946289, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.853803813457489, + "num_tokens": 231358382.0, + "step": 6063 + }, + { + "epoch": 0.7714031293728534, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.946144104003906, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8699874877929688, + "num_tokens": 231401915.0, + "step": 6064 + }, + { + "epoch": 0.7715303396514438, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.782068252563477, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8387669324874878, + "num_tokens": 231433241.0, + "step": 6065 + }, + { + "epoch": 0.7716575499300343, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.03438949584961, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8554501533508301, + "num_tokens": 231478318.0, + "step": 6066 + }, + { + "epoch": 0.7717847602086249, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.837772369384766, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8540983200073242, + "num_tokens": 231518584.0, + "step": 6067 + }, + { + "epoch": 0.7719119704872154, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.899024963378906, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.860853910446167, + "num_tokens": 231556689.0, + "step": 6068 + }, + { + "epoch": 0.7720391807658059, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.920930862426758, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8599168062210083, + "num_tokens": 231593936.0, + "step": 6069 + }, + { + "epoch": 0.7721663910443964, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.96780014038086, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.880449652671814, + "num_tokens": 231634785.0, + "step": 6070 + }, + { + "epoch": 0.7722936013229869, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.81751251220703, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8569530844688416, + "num_tokens": 231675798.0, + "step": 6071 + }, + { + "epoch": 0.7724208116015774, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.93804359436035, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8588302135467529, + "num_tokens": 231711611.0, + "step": 6072 + }, + { + "epoch": 0.7725480218801679, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95537567138672, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8516547679901123, + "num_tokens": 231746997.0, + "step": 6073 + }, + { + "epoch": 0.7726752321587584, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.167009353637695, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8412966728210449, + "num_tokens": 231787696.0, + "step": 6074 + }, + { + "epoch": 0.772802442437349, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.84001922607422, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8524815440177917, + "num_tokens": 231826127.0, + "step": 6075 + }, + { + "epoch": 0.7729296527159395, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.85650634765625, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8559294939041138, + "num_tokens": 231857538.0, + "step": 6076 + }, + { + "epoch": 0.7730568629945299, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.940784454345703, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8591330647468567, + "num_tokens": 231893913.0, + "step": 6077 + }, + { + "epoch": 0.7731840732731204, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.849178314208984, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8629733324050903, + "num_tokens": 231934274.0, + "step": 6078 + }, + { + "epoch": 0.773311283551711, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.94005012512207, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.854106068611145, + "num_tokens": 231983197.0, + "step": 6079 + }, + { + "epoch": 0.7734384938303015, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.749311447143555, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8567616939544678, + "num_tokens": 232021265.0, + "step": 6080 + }, + { + "epoch": 0.773565704108892, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.939403533935547, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8589425086975098, + "num_tokens": 232060992.0, + "step": 6081 + }, + { + "epoch": 0.7736929143874826, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.869266510009766, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8585830926895142, + "num_tokens": 232096758.0, + "step": 6082 + }, + { + "epoch": 0.773820124666073, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.003931045532227, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8574955463409424, + "num_tokens": 232133773.0, + "step": 6083 + }, + { + "epoch": 0.7739473349446635, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.825925827026367, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8586210012435913, + "num_tokens": 232173182.0, + "step": 6084 + }, + { + "epoch": 0.774074545223254, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.952795028686523, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8686667084693909, + "num_tokens": 232206478.0, + "step": 6085 + }, + { + "epoch": 0.7742017555018446, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.010526657104492, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8538219928741455, + "num_tokens": 232242231.0, + "step": 6086 + }, + { + "epoch": 0.7743289657804351, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.909666061401367, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8639165163040161, + "num_tokens": 232270636.0, + "step": 6087 + }, + { + "epoch": 0.7744561760590256, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.86789321899414, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8661330342292786, + "num_tokens": 232309974.0, + "step": 6088 + }, + { + "epoch": 0.774583386337616, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.77719497680664, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8688626289367676, + "num_tokens": 232344118.0, + "step": 6089 + }, + { + "epoch": 0.7747105966162066, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.090070724487305, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8500902056694031, + "num_tokens": 232385183.0, + "step": 6090 + }, + { + "epoch": 0.7748378068947971, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.870866775512695, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8618756532669067, + "num_tokens": 232421670.0, + "step": 6091 + }, + { + "epoch": 0.7749650171733876, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.952919006347656, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8487290143966675, + "num_tokens": 232459563.0, + "step": 6092 + }, + { + "epoch": 0.7750922274519781, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91413116455078, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8608059883117676, + "num_tokens": 232496357.0, + "step": 6093 + }, + { + "epoch": 0.7752194377305687, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.01103973388672, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8612885475158691, + "num_tokens": 232530436.0, + "step": 6094 + }, + { + "epoch": 0.7753466480091591, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90215301513672, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8546631932258606, + "num_tokens": 232568585.0, + "step": 6095 + }, + { + "epoch": 0.7754738582877496, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.164377212524414, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.866848886013031, + "num_tokens": 232602506.0, + "step": 6096 + }, + { + "epoch": 0.7756010685663401, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.02834129333496, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8680398464202881, + "num_tokens": 232642819.0, + "step": 6097 + }, + { + "epoch": 0.7757282788449307, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.046607971191406, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8487035036087036, + "num_tokens": 232679505.0, + "step": 6098 + }, + { + "epoch": 0.7758554891235212, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9832820892334, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8655955791473389, + "num_tokens": 232715701.0, + "step": 6099 + }, + { + "epoch": 0.7759826994021117, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.855052947998047, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.861897885799408, + "num_tokens": 232750786.0, + "step": 6100 + }, + { + "epoch": 0.7761099096807021, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.047048568725586, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.849600076675415, + "num_tokens": 232791370.0, + "step": 6101 + }, + { + "epoch": 0.7762371199592927, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.982437133789062, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8622651100158691, + "num_tokens": 232830895.0, + "step": 6102 + }, + { + "epoch": 0.7763643302378832, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.878854751586914, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8595724105834961, + "num_tokens": 232869940.0, + "step": 6103 + }, + { + "epoch": 0.7764915405164737, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.02651596069336, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8565382361412048, + "num_tokens": 232910743.0, + "step": 6104 + }, + { + "epoch": 0.7766187507950643, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.802824020385742, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8603627681732178, + "num_tokens": 232948428.0, + "step": 6105 + }, + { + "epoch": 0.7767459610736548, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.097135543823242, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8662844300270081, + "num_tokens": 232985142.0, + "step": 6106 + }, + { + "epoch": 0.7768731713522452, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.89925193786621, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8693492412567139, + "num_tokens": 233020810.0, + "step": 6107 + }, + { + "epoch": 0.7770003816308357, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.02103614807129, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8525911569595337, + "num_tokens": 233055674.0, + "step": 6108 + }, + { + "epoch": 0.7771275919094263, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.995668411254883, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8556927442550659, + "num_tokens": 233084506.0, + "step": 6109 + }, + { + "epoch": 0.7772548021880168, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.948026657104492, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8766050934791565, + "num_tokens": 233120172.0, + "step": 6110 + }, + { + "epoch": 0.7773820124666073, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.96908187866211, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.850521445274353, + "num_tokens": 233160075.0, + "step": 6111 + }, + { + "epoch": 0.7775092227451978, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.964950561523438, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8526051044464111, + "num_tokens": 233201714.0, + "step": 6112 + }, + { + "epoch": 0.7776364330237884, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.779783248901367, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8669381141662598, + "num_tokens": 233237642.0, + "step": 6113 + }, + { + "epoch": 0.7777636433023788, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.898279190063477, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8655222058296204, + "num_tokens": 233279712.0, + "step": 6114 + }, + { + "epoch": 0.7778908535809693, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.99747657775879, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8663098812103271, + "num_tokens": 233322441.0, + "step": 6115 + }, + { + "epoch": 0.7780180638595598, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.938682556152344, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8549256324768066, + "num_tokens": 233371053.0, + "step": 6116 + }, + { + "epoch": 0.7781452741381504, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.02489471435547, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8625986576080322, + "num_tokens": 233405372.0, + "step": 6117 + }, + { + "epoch": 0.7782724844167409, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0789737701416, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8669276237487793, + "num_tokens": 233447209.0, + "step": 6118 + }, + { + "epoch": 0.7783996946953314, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06728172302246, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8626532554626465, + "num_tokens": 233484621.0, + "step": 6119 + }, + { + "epoch": 0.7785269049739219, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.155841827392578, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8440326452255249, + "num_tokens": 233520756.0, + "step": 6120 + }, + { + "epoch": 0.7786541152525124, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.861713409423828, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.852626621723175, + "num_tokens": 233559120.0, + "step": 6121 + }, + { + "epoch": 0.7787813255311029, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.938405990600586, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8722527623176575, + "num_tokens": 233590513.0, + "step": 6122 + }, + { + "epoch": 0.7789085358096934, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24382972717285, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8777288794517517, + "num_tokens": 233625986.0, + "step": 6123 + }, + { + "epoch": 0.779035746088284, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9617919921875, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8694005012512207, + "num_tokens": 233655097.0, + "step": 6124 + }, + { + "epoch": 0.7791629563668745, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.869979858398438, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8506235480308533, + "num_tokens": 233687815.0, + "step": 6125 + }, + { + "epoch": 0.7792901666454649, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.909259796142578, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8734849095344543, + "num_tokens": 233722611.0, + "step": 6126 + }, + { + "epoch": 0.7794173769240554, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.031023025512695, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.848554253578186, + "num_tokens": 233765345.0, + "step": 6127 + }, + { + "epoch": 0.779544587202646, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.815956115722656, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8624296188354492, + "num_tokens": 233804358.0, + "step": 6128 + }, + { + "epoch": 0.7796717974812365, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.2441349029541, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8500936031341553, + "num_tokens": 233845117.0, + "step": 6129 + }, + { + "epoch": 0.779799007759827, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.855600357055664, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8527514934539795, + "num_tokens": 233884091.0, + "step": 6130 + }, + { + "epoch": 0.7799262180384176, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.060623168945312, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8577194213867188, + "num_tokens": 233922590.0, + "step": 6131 + }, + { + "epoch": 0.780053428317008, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.074806213378906, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8631494045257568, + "num_tokens": 233954138.0, + "step": 6132 + }, + { + "epoch": 0.7801806385955985, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.967531204223633, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8699942827224731, + "num_tokens": 233988035.0, + "step": 6133 + }, + { + "epoch": 0.780307848874189, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.857797622680664, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.851043701171875, + "num_tokens": 234023747.0, + "step": 6134 + }, + { + "epoch": 0.7804350591527796, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.872554779052734, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8688197135925293, + "num_tokens": 234064182.0, + "step": 6135 + }, + { + "epoch": 0.7805622694313701, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.15941047668457, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8438364863395691, + "num_tokens": 234106533.0, + "step": 6136 + }, + { + "epoch": 0.7806894797099606, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.94999885559082, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.873496413230896, + "num_tokens": 234145492.0, + "step": 6137 + }, + { + "epoch": 0.780816689988551, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.092748641967773, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8640516400337219, + "num_tokens": 234181709.0, + "step": 6138 + }, + { + "epoch": 0.7809439002671416, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.972366333007812, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8613409996032715, + "num_tokens": 234217854.0, + "step": 6139 + }, + { + "epoch": 0.7810711105457321, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.93665885925293, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8605115413665771, + "num_tokens": 234255769.0, + "step": 6140 + }, + { + "epoch": 0.7811983208243226, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.021068572998047, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8470244407653809, + "num_tokens": 234290828.0, + "step": 6141 + }, + { + "epoch": 0.7813255311029131, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.959609985351562, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8600315451622009, + "num_tokens": 234326336.0, + "step": 6142 + }, + { + "epoch": 0.7814527413815037, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.006258010864258, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8556998372077942, + "num_tokens": 234366210.0, + "step": 6143 + }, + { + "epoch": 0.7815799516600941, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.006362915039062, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8522133827209473, + "num_tokens": 234398975.0, + "step": 6144 + }, + { + "epoch": 0.7817071619386846, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.97003173828125, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8545277714729309, + "num_tokens": 234437746.0, + "step": 6145 + }, + { + "epoch": 0.7818343722172751, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.271881103515625, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8649570941925049, + "num_tokens": 234478402.0, + "step": 6146 + }, + { + "epoch": 0.7819615824958657, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.88041114807129, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8465726375579834, + "num_tokens": 234509699.0, + "step": 6147 + }, + { + "epoch": 0.7820887927744562, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.072542190551758, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.855309009552002, + "num_tokens": 234551515.0, + "step": 6148 + }, + { + "epoch": 0.7822160030530467, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.22836685180664, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8675776124000549, + "num_tokens": 234582811.0, + "step": 6149 + }, + { + "epoch": 0.7823432133316371, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.960847854614258, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8525365591049194, + "num_tokens": 234625299.0, + "step": 6150 + }, + { + "epoch": 0.7824704236102277, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.087257385253906, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8670452237129211, + "num_tokens": 234665641.0, + "step": 6151 + }, + { + "epoch": 0.7825976338888182, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.94498634338379, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8596774339675903, + "num_tokens": 234704668.0, + "step": 6152 + }, + { + "epoch": 0.7827248441674087, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.995990753173828, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8645184636116028, + "num_tokens": 234738080.0, + "step": 6153 + }, + { + "epoch": 0.7828520544459993, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.010454177856445, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8580265045166016, + "num_tokens": 234781077.0, + "step": 6154 + }, + { + "epoch": 0.7829792647245898, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.062475204467773, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8443485498428345, + "num_tokens": 234809374.0, + "step": 6155 + }, + { + "epoch": 0.7831064750031802, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.084880828857422, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8485630750656128, + "num_tokens": 234845585.0, + "step": 6156 + }, + { + "epoch": 0.7832336852817707, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.7689208984375, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8595813512802124, + "num_tokens": 234883891.0, + "step": 6157 + }, + { + "epoch": 0.7833608955603613, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.16419792175293, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8595091104507446, + "num_tokens": 234919445.0, + "step": 6158 + }, + { + "epoch": 0.7834881058389518, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.99405288696289, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8627040982246399, + "num_tokens": 234951465.0, + "step": 6159 + }, + { + "epoch": 0.7836153161175423, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.88661766052246, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8515296578407288, + "num_tokens": 234989970.0, + "step": 6160 + }, + { + "epoch": 0.7837425263961328, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.103515625, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8618686199188232, + "num_tokens": 235029673.0, + "step": 6161 + }, + { + "epoch": 0.7838697366747234, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.029508590698242, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8596208095550537, + "num_tokens": 235065389.0, + "step": 6162 + }, + { + "epoch": 0.7839969469533138, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09527587890625, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8714672923088074, + "num_tokens": 235099795.0, + "step": 6163 + }, + { + "epoch": 0.7841241572319043, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.96609878540039, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.860968828201294, + "num_tokens": 235137475.0, + "step": 6164 + }, + { + "epoch": 0.7842513675104948, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.011343002319336, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8558198809623718, + "num_tokens": 235178454.0, + "step": 6165 + }, + { + "epoch": 0.7843785777890854, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.996870040893555, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8784475922584534, + "num_tokens": 235216139.0, + "step": 6166 + }, + { + "epoch": 0.7845057880676759, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.99401092529297, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8751188516616821, + "num_tokens": 235253571.0, + "step": 6167 + }, + { + "epoch": 0.7846329983462664, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90559959411621, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8449760675430298, + "num_tokens": 235293950.0, + "step": 6168 + }, + { + "epoch": 0.7847602086248568, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.081951141357422, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8645002841949463, + "num_tokens": 235336419.0, + "step": 6169 + }, + { + "epoch": 0.7848874189034474, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.85360336303711, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8658634424209595, + "num_tokens": 235376039.0, + "step": 6170 + }, + { + "epoch": 0.7850146291820379, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.333663940429688, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8492467403411865, + "num_tokens": 235412736.0, + "step": 6171 + }, + { + "epoch": 0.7851418394606284, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.150638580322266, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8612205386161804, + "num_tokens": 235457318.0, + "step": 6172 + }, + { + "epoch": 0.785269049739219, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9840030670166, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8506178259849548, + "num_tokens": 235491471.0, + "step": 6173 + }, + { + "epoch": 0.7853962600178095, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.936084747314453, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8709653615951538, + "num_tokens": 235531324.0, + "step": 6174 + }, + { + "epoch": 0.7855234702963999, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.056921005249023, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8478405475616455, + "num_tokens": 235573233.0, + "step": 6175 + }, + { + "epoch": 0.7856506805749904, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.917469024658203, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8613768815994263, + "num_tokens": 235614400.0, + "step": 6176 + }, + { + "epoch": 0.785777890853581, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.075050354003906, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.867129921913147, + "num_tokens": 235652247.0, + "step": 6177 + }, + { + "epoch": 0.7859051011321715, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06022834777832, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8576748967170715, + "num_tokens": 235681810.0, + "step": 6178 + }, + { + "epoch": 0.786032311410762, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.013330459594727, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.869592547416687, + "num_tokens": 235718422.0, + "step": 6179 + }, + { + "epoch": 0.7861595216893525, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.05731964111328, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8558563590049744, + "num_tokens": 235758886.0, + "step": 6180 + }, + { + "epoch": 0.786286731967943, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.058509826660156, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8642182946205139, + "num_tokens": 235792270.0, + "step": 6181 + }, + { + "epoch": 0.7864139422465335, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.973466873168945, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8509525656700134, + "num_tokens": 235826678.0, + "step": 6182 + }, + { + "epoch": 0.786541152525124, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.117219924926758, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8723067045211792, + "num_tokens": 235860437.0, + "step": 6183 + }, + { + "epoch": 0.7866683628037145, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09185791015625, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8654117584228516, + "num_tokens": 235900865.0, + "step": 6184 + }, + { + "epoch": 0.7867955730823051, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09522247314453, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8459470868110657, + "num_tokens": 235937419.0, + "step": 6185 + }, + { + "epoch": 0.7869227833608956, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.083236694335938, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8514103889465332, + "num_tokens": 235976425.0, + "step": 6186 + }, + { + "epoch": 0.787049993639486, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.121095657348633, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8539451956748962, + "num_tokens": 236012968.0, + "step": 6187 + }, + { + "epoch": 0.7871772039180766, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.033971786499023, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8618766069412231, + "num_tokens": 236052462.0, + "step": 6188 + }, + { + "epoch": 0.7873044141966671, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.151039123535156, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8759979605674744, + "num_tokens": 236087604.0, + "step": 6189 + }, + { + "epoch": 0.7874316244752576, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.22124481201172, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8606687784194946, + "num_tokens": 236126344.0, + "step": 6190 + }, + { + "epoch": 0.7875588347538481, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.193443298339844, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8688052296638489, + "num_tokens": 236161913.0, + "step": 6191 + }, + { + "epoch": 0.7876860450324387, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.060832977294922, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8635882139205933, + "num_tokens": 236200612.0, + "step": 6192 + }, + { + "epoch": 0.7878132553110291, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.236522674560547, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8666821718215942, + "num_tokens": 236238136.0, + "step": 6193 + }, + { + "epoch": 0.7879404655896196, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.911239624023438, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8574000597000122, + "num_tokens": 236281303.0, + "step": 6194 + }, + { + "epoch": 0.7880676758682101, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.163515090942383, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8580633401870728, + "num_tokens": 236322355.0, + "step": 6195 + }, + { + "epoch": 0.7881948861468007, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12224769592285, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.859748363494873, + "num_tokens": 236354263.0, + "step": 6196 + }, + { + "epoch": 0.7883220964253912, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.066097259521484, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8624296188354492, + "num_tokens": 236391581.0, + "step": 6197 + }, + { + "epoch": 0.7884493067039817, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.084640502929688, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.869257926940918, + "num_tokens": 236426003.0, + "step": 6198 + }, + { + "epoch": 0.7885765169825721, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.956758499145508, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.870242714881897, + "num_tokens": 236464874.0, + "step": 6199 + }, + { + "epoch": 0.7887037272611627, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09743309020996, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8711981773376465, + "num_tokens": 236499826.0, + "step": 6200 + }, + { + "epoch": 0.7888309375397532, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.22772789001465, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8727383613586426, + "num_tokens": 236538615.0, + "step": 6201 + }, + { + "epoch": 0.7889581478183437, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.94019317626953, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8589596152305603, + "num_tokens": 236577860.0, + "step": 6202 + }, + { + "epoch": 0.7890853580969343, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28258514404297, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8754881620407104, + "num_tokens": 236619041.0, + "step": 6203 + }, + { + "epoch": 0.7892125683755248, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.988842010498047, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.864922046661377, + "num_tokens": 236655788.0, + "step": 6204 + }, + { + "epoch": 0.7893397786541152, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.1618709564209, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8724268078804016, + "num_tokens": 236691532.0, + "step": 6205 + }, + { + "epoch": 0.7894669889327057, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.114721298217773, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8685917854309082, + "num_tokens": 236733792.0, + "step": 6206 + }, + { + "epoch": 0.7895941992112963, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.089336395263672, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.872782826423645, + "num_tokens": 236775454.0, + "step": 6207 + }, + { + "epoch": 0.7897214094898868, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.031097412109375, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8715932965278625, + "num_tokens": 236811572.0, + "step": 6208 + }, + { + "epoch": 0.7898486197684773, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.92755889892578, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.846277117729187, + "num_tokens": 236851018.0, + "step": 6209 + }, + { + "epoch": 0.7899758300470678, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12850570678711, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.865332841873169, + "num_tokens": 236891871.0, + "step": 6210 + }, + { + "epoch": 0.7901030403256584, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.143775939941406, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8650575280189514, + "num_tokens": 236926771.0, + "step": 6211 + }, + { + "epoch": 0.7902302506042488, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.016630172729492, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8486782312393188, + "num_tokens": 236964768.0, + "step": 6212 + }, + { + "epoch": 0.7903574608828393, + "ewc_loss": 0.0262451171875, + "ewc_loss_parallel": 2.6226043701171875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.15033531188965, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8499995470046997, + "num_tokens": 237005075.0, + "step": 6213 + }, + { + "epoch": 0.7904846711614298, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.997285842895508, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8626943826675415, + "num_tokens": 237035517.0, + "step": 6214 + }, + { + "epoch": 0.7906118814400204, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.19022560119629, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.878423273563385, + "num_tokens": 237076319.0, + "step": 6215 + }, + { + "epoch": 0.7907390917186109, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.069665908813477, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8522449135780334, + "num_tokens": 237117935.0, + "step": 6216 + }, + { + "epoch": 0.7908663019972014, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.012666702270508, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.870228111743927, + "num_tokens": 237155789.0, + "step": 6217 + }, + { + "epoch": 0.7909935122757918, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12122344970703, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8671736717224121, + "num_tokens": 237198870.0, + "step": 6218 + }, + { + "epoch": 0.7911207225543824, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95026206970215, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8665213584899902, + "num_tokens": 237243084.0, + "step": 6219 + }, + { + "epoch": 0.7912479328329729, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09332847595215, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8545942306518555, + "num_tokens": 237277245.0, + "step": 6220 + }, + { + "epoch": 0.7913751431115634, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.193208694458008, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8661568760871887, + "num_tokens": 237317841.0, + "step": 6221 + }, + { + "epoch": 0.791502353390154, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.93174934387207, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8508317470550537, + "num_tokens": 237349327.0, + "step": 6222 + }, + { + "epoch": 0.7916295636687445, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.291749954223633, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8475807309150696, + "num_tokens": 237396179.0, + "step": 6223 + }, + { + "epoch": 0.7917567739473349, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.995567321777344, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8580548763275146, + "num_tokens": 237437766.0, + "step": 6224 + }, + { + "epoch": 0.7918839842259254, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.957557678222656, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8654785752296448, + "num_tokens": 237479239.0, + "step": 6225 + }, + { + "epoch": 0.792011194504516, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91204071044922, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8747888803482056, + "num_tokens": 237514641.0, + "step": 6226 + }, + { + "epoch": 0.7921384047831065, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.014230728149414, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8722401261329651, + "num_tokens": 237555319.0, + "step": 6227 + }, + { + "epoch": 0.792265615061697, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.035564422607422, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8644752502441406, + "num_tokens": 237597786.0, + "step": 6228 + }, + { + "epoch": 0.7923928253402875, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0535888671875, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8731366395950317, + "num_tokens": 237636584.0, + "step": 6229 + }, + { + "epoch": 0.792520035618878, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.140396118164062, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8529735207557678, + "num_tokens": 237676000.0, + "step": 6230 + }, + { + "epoch": 0.7926472458974685, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.901233673095703, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8880269527435303, + "num_tokens": 237713003.0, + "step": 6231 + }, + { + "epoch": 0.792774456176059, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09316062927246, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8634667992591858, + "num_tokens": 237742054.0, + "step": 6232 + }, + { + "epoch": 0.7929016664546495, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.102596282958984, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8539166450500488, + "num_tokens": 237783147.0, + "step": 6233 + }, + { + "epoch": 0.7930288767332401, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.93143653869629, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8553453683853149, + "num_tokens": 237814322.0, + "step": 6234 + }, + { + "epoch": 0.7931560870118306, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.138580322265625, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8667981028556824, + "num_tokens": 237849278.0, + "step": 6235 + }, + { + "epoch": 0.793283297290421, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.90435028076172, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8664528727531433, + "num_tokens": 237893434.0, + "step": 6236 + }, + { + "epoch": 0.7934105075690115, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.08489418029785, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8692977428436279, + "num_tokens": 237932135.0, + "step": 6237 + }, + { + "epoch": 0.7935377178476021, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.016483306884766, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8560446500778198, + "num_tokens": 237966320.0, + "step": 6238 + }, + { + "epoch": 0.7936649281261926, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.063457489013672, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8657850027084351, + "num_tokens": 238006365.0, + "step": 6239 + }, + { + "epoch": 0.7937921384047831, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9482364654541, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8579879403114319, + "num_tokens": 238039307.0, + "step": 6240 + }, + { + "epoch": 0.7939193486833737, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.178421020507812, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8484683036804199, + "num_tokens": 238078015.0, + "step": 6241 + }, + { + "epoch": 0.7940465589619641, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.067087173461914, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.858795166015625, + "num_tokens": 238116790.0, + "step": 6242 + }, + { + "epoch": 0.7941737692405546, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.997833251953125, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8678802847862244, + "num_tokens": 238155330.0, + "step": 6243 + }, + { + "epoch": 0.7943009795191451, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.004531860351562, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.846168577671051, + "num_tokens": 238194492.0, + "step": 6244 + }, + { + "epoch": 0.7944281897977357, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.043670654296875, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8614892959594727, + "num_tokens": 238230451.0, + "step": 6245 + }, + { + "epoch": 0.7945554000763262, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.17918586730957, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8692207932472229, + "num_tokens": 238266331.0, + "step": 6246 + }, + { + "epoch": 0.7946826103549167, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.896032333374023, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8508244752883911, + "num_tokens": 238307753.0, + "step": 6247 + }, + { + "epoch": 0.7948098206335071, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.106821060180664, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8623139262199402, + "num_tokens": 238349644.0, + "step": 6248 + }, + { + "epoch": 0.7949370309120977, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.095144271850586, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8468754291534424, + "num_tokens": 238386308.0, + "step": 6249 + }, + { + "epoch": 0.7950642411906882, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.929668426513672, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8461952209472656, + "num_tokens": 238427935.0, + "step": 6250 + }, + { + "epoch": 0.7951914514692787, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.17371940612793, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.866757333278656, + "num_tokens": 238463639.0, + "step": 6251 + }, + { + "epoch": 0.7953186617478692, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.20859146118164, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8740912079811096, + "num_tokens": 238498295.0, + "step": 6252 + }, + { + "epoch": 0.7954458720264598, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.910205841064453, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8608224391937256, + "num_tokens": 238538109.0, + "step": 6253 + }, + { + "epoch": 0.7955730823050502, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.097612380981445, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8480576276779175, + "num_tokens": 238574415.0, + "step": 6254 + }, + { + "epoch": 0.7957002925836407, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.089599609375, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8632655739784241, + "num_tokens": 238616368.0, + "step": 6255 + }, + { + "epoch": 0.7958275028622313, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.2528018951416, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8655019998550415, + "num_tokens": 238650634.0, + "step": 6256 + }, + { + "epoch": 0.7959547131408218, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.998132705688477, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8557082414627075, + "num_tokens": 238684383.0, + "step": 6257 + }, + { + "epoch": 0.7960819234194123, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.091882705688477, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8594083786010742, + "num_tokens": 238722863.0, + "step": 6258 + }, + { + "epoch": 0.7962091336980028, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.155258178710938, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8631440997123718, + "num_tokens": 238765027.0, + "step": 6259 + }, + { + "epoch": 0.7963363439765934, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.10405731201172, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8659020662307739, + "num_tokens": 238801950.0, + "step": 6260 + }, + { + "epoch": 0.7964635542551838, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.302473068237305, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.852388858795166, + "num_tokens": 238840963.0, + "step": 6261 + }, + { + "epoch": 0.7965907645337743, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.970321655273438, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8479366898536682, + "num_tokens": 238876167.0, + "step": 6262 + }, + { + "epoch": 0.7967179748123648, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.236080169677734, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.867588996887207, + "num_tokens": 238910293.0, + "step": 6263 + }, + { + "epoch": 0.7968451850909554, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.046918869018555, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8574386835098267, + "num_tokens": 238947279.0, + "step": 6264 + }, + { + "epoch": 0.7969723953695459, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.053695678710938, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8506898880004883, + "num_tokens": 238984148.0, + "step": 6265 + }, + { + "epoch": 0.7970996056481364, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.134626388549805, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8475126028060913, + "num_tokens": 239022370.0, + "step": 6266 + }, + { + "epoch": 0.7972268159267268, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0875301361084, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8579272627830505, + "num_tokens": 239064542.0, + "step": 6267 + }, + { + "epoch": 0.7973540262053174, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.08995246887207, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.860069990158081, + "num_tokens": 239103193.0, + "step": 6268 + }, + { + "epoch": 0.7974812364839079, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.039030075073242, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8688536882400513, + "num_tokens": 239137952.0, + "step": 6269 + }, + { + "epoch": 0.7976084467624984, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.124319076538086, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.850630521774292, + "num_tokens": 239174600.0, + "step": 6270 + }, + { + "epoch": 0.797735657041089, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.156217575073242, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8667699098587036, + "num_tokens": 239206157.0, + "step": 6271 + }, + { + "epoch": 0.7978628673196795, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.134685516357422, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8655176758766174, + "num_tokens": 239243094.0, + "step": 6272 + }, + { + "epoch": 0.7979900775982699, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.158185958862305, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8625498414039612, + "num_tokens": 239281208.0, + "step": 6273 + }, + { + "epoch": 0.7981172878768604, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.119598388671875, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8570005893707275, + "num_tokens": 239320159.0, + "step": 6274 + }, + { + "epoch": 0.798244498155451, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.030593872070312, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8504345417022705, + "num_tokens": 239360160.0, + "step": 6275 + }, + { + "epoch": 0.7983717084340415, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.97396469116211, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8640880584716797, + "num_tokens": 239398101.0, + "step": 6276 + }, + { + "epoch": 0.798498918712632, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.132579803466797, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8763004541397095, + "num_tokens": 239438560.0, + "step": 6277 + }, + { + "epoch": 0.7986261289912225, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.868364334106445, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8560038805007935, + "num_tokens": 239475851.0, + "step": 6278 + }, + { + "epoch": 0.798753339269813, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.13437843322754, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8660527467727661, + "num_tokens": 239519165.0, + "step": 6279 + }, + { + "epoch": 0.7988805495484035, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.079729080200195, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8488577604293823, + "num_tokens": 239560602.0, + "step": 6280 + }, + { + "epoch": 0.799007759826994, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35991096496582, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8630033135414124, + "num_tokens": 239591933.0, + "step": 6281 + }, + { + "epoch": 0.7991349701055845, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.055248260498047, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.855525016784668, + "num_tokens": 239625495.0, + "step": 6282 + }, + { + "epoch": 0.7992621803841751, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3039608001709, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8761997818946838, + "num_tokens": 239658249.0, + "step": 6283 + }, + { + "epoch": 0.7993893906627656, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.020673751831055, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8524084687232971, + "num_tokens": 239690194.0, + "step": 6284 + }, + { + "epoch": 0.799516600941356, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.07090187072754, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8596735000610352, + "num_tokens": 239728315.0, + "step": 6285 + }, + { + "epoch": 0.7996438112199465, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.14285659790039, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.866269588470459, + "num_tokens": 239764505.0, + "step": 6286 + }, + { + "epoch": 0.7997710214985371, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.1981201171875, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8452107906341553, + "num_tokens": 239809461.0, + "step": 6287 + }, + { + "epoch": 0.7998982317771276, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.01395606994629, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8560223579406738, + "num_tokens": 239845232.0, + "step": 6288 + }, + { + "epoch": 0.8000254420557181, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.929319381713867, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.866894543170929, + "num_tokens": 239885489.0, + "step": 6289 + }, + { + "epoch": 0.8001526523343087, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.155195236206055, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8477523326873779, + "num_tokens": 239921159.0, + "step": 6290 + }, + { + "epoch": 0.8002798626128991, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.117321014404297, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8658440113067627, + "num_tokens": 239960879.0, + "step": 6291 + }, + { + "epoch": 0.8004070728914896, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.112384796142578, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8503327369689941, + "num_tokens": 239994302.0, + "step": 6292 + }, + { + "epoch": 0.8005342831700801, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.136564254760742, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8671804666519165, + "num_tokens": 240027759.0, + "step": 6293 + }, + { + "epoch": 0.8006614934486707, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.341676712036133, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8566831350326538, + "num_tokens": 240065049.0, + "step": 6294 + }, + { + "epoch": 0.8007887037272612, + "ewc_loss": 0.0264892578125, + "ewc_loss_parallel": 2.6464462280273438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.15658187866211, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8468080759048462, + "num_tokens": 240101453.0, + "step": 6295 + }, + { + "epoch": 0.8009159140058517, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40717315673828, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8401888608932495, + "num_tokens": 240136831.0, + "step": 6296 + }, + { + "epoch": 0.8010431242844421, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.069934844970703, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8592122197151184, + "num_tokens": 240173364.0, + "step": 6297 + }, + { + "epoch": 0.8011703345630327, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.184106826782227, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8390892744064331, + "num_tokens": 240206718.0, + "step": 6298 + }, + { + "epoch": 0.8012975448416232, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.051895141601562, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8487181663513184, + "num_tokens": 240245633.0, + "step": 6299 + }, + { + "epoch": 0.8014247551202137, + "ewc_loss": 0.0263671875, + "ewc_loss_parallel": 2.6345252990722656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.033321380615234, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8749274015426636, + "num_tokens": 240283887.0, + "step": 6300 + }, + { + "epoch": 0.8015519653988042, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.181671142578125, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8528503179550171, + "num_tokens": 240322208.0, + "step": 6301 + }, + { + "epoch": 0.8016791756773948, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.940021514892578, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8636896014213562, + "num_tokens": 240356154.0, + "step": 6302 + }, + { + "epoch": 0.8018063859559852, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.983051300048828, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8590651750564575, + "num_tokens": 240389103.0, + "step": 6303 + }, + { + "epoch": 0.8019335962345757, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.00531768798828, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8765820860862732, + "num_tokens": 240429545.0, + "step": 6304 + }, + { + "epoch": 0.8020608065131662, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.31106185913086, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8784658908843994, + "num_tokens": 240468556.0, + "step": 6305 + }, + { + "epoch": 0.8021880167917568, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28643226623535, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8618754744529724, + "num_tokens": 240508534.0, + "step": 6306 + }, + { + "epoch": 0.8023152270703473, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.108749389648438, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8605571985244751, + "num_tokens": 240540833.0, + "step": 6307 + }, + { + "epoch": 0.8024424373489378, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.1672306060791, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.852679967880249, + "num_tokens": 240577768.0, + "step": 6308 + }, + { + "epoch": 0.8025696476275284, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.071731567382812, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8549606800079346, + "num_tokens": 240610607.0, + "step": 6309 + }, + { + "epoch": 0.8026968579061188, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.256288528442383, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8576676845550537, + "num_tokens": 240646056.0, + "step": 6310 + }, + { + "epoch": 0.8028240681847093, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.885093688964844, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8682754039764404, + "num_tokens": 240682513.0, + "step": 6311 + }, + { + "epoch": 0.8029512784632998, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.157270431518555, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8622438311576843, + "num_tokens": 240716311.0, + "step": 6312 + }, + { + "epoch": 0.8030784887418904, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.106290817260742, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8523706197738647, + "num_tokens": 240758873.0, + "step": 6313 + }, + { + "epoch": 0.8032056990204809, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.139039993286133, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8656808137893677, + "num_tokens": 240797163.0, + "step": 6314 + }, + { + "epoch": 0.8033329092990714, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95204734802246, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8718448877334595, + "num_tokens": 240835719.0, + "step": 6315 + }, + { + "epoch": 0.8034601195776618, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.436174392700195, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8551478981971741, + "num_tokens": 240874359.0, + "step": 6316 + }, + { + "epoch": 0.8035873298562524, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.126359939575195, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8742374181747437, + "num_tokens": 240909125.0, + "step": 6317 + }, + { + "epoch": 0.8037145401348429, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.071474075317383, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8656871318817139, + "num_tokens": 240949391.0, + "step": 6318 + }, + { + "epoch": 0.8038417504134334, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.23409080505371, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8507471084594727, + "num_tokens": 240987390.0, + "step": 6319 + }, + { + "epoch": 0.803968960692024, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.08336639404297, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8531978130340576, + "num_tokens": 241025359.0, + "step": 6320 + }, + { + "epoch": 0.8040961709706145, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.271472930908203, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.854466438293457, + "num_tokens": 241060532.0, + "step": 6321 + }, + { + "epoch": 0.8042233812492049, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.241701126098633, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8671963214874268, + "num_tokens": 241098547.0, + "step": 6322 + }, + { + "epoch": 0.8043505915277954, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.03084373474121, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8604658842086792, + "num_tokens": 241131091.0, + "step": 6323 + }, + { + "epoch": 0.804477801806386, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.141889572143555, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8507142663002014, + "num_tokens": 241164495.0, + "step": 6324 + }, + { + "epoch": 0.8046050120849765, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.11049461364746, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8552432060241699, + "num_tokens": 241203777.0, + "step": 6325 + }, + { + "epoch": 0.804732222363567, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.095060348510742, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8546813130378723, + "num_tokens": 241243219.0, + "step": 6326 + }, + { + "epoch": 0.8048594326421575, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.091766357421875, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8456049561500549, + "num_tokens": 241284547.0, + "step": 6327 + }, + { + "epoch": 0.804986642920748, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.92861557006836, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8430286049842834, + "num_tokens": 241331110.0, + "step": 6328 + }, + { + "epoch": 0.8051138531993385, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.927574157714844, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8597520589828491, + "num_tokens": 241372720.0, + "step": 6329 + }, + { + "epoch": 0.805241063477929, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.161542892456055, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8625196218490601, + "num_tokens": 241408173.0, + "step": 6330 + }, + { + "epoch": 0.8053682737565195, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.017372131347656, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.867050290107727, + "num_tokens": 241445021.0, + "step": 6331 + }, + { + "epoch": 0.8054954840351101, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.05755615234375, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8652908802032471, + "num_tokens": 241481874.0, + "step": 6332 + }, + { + "epoch": 0.8056226943137006, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.27867889404297, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8455699682235718, + "num_tokens": 241523140.0, + "step": 6333 + }, + { + "epoch": 0.805749904592291, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.957843780517578, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.841876208782196, + "num_tokens": 241563424.0, + "step": 6334 + }, + { + "epoch": 0.8058771148708815, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.367990493774414, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8725172877311707, + "num_tokens": 241600004.0, + "step": 6335 + }, + { + "epoch": 0.8060043251494721, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.050872802734375, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8595888018608093, + "num_tokens": 241642704.0, + "step": 6336 + }, + { + "epoch": 0.8061315354280626, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.92839241027832, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8658822774887085, + "num_tokens": 241679176.0, + "step": 6337 + }, + { + "epoch": 0.8062587457066531, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52946662902832, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8653239607810974, + "num_tokens": 241721168.0, + "step": 6338 + }, + { + "epoch": 0.8063859559852437, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.012548446655273, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8744286298751831, + "num_tokens": 241759735.0, + "step": 6339 + }, + { + "epoch": 0.8065131662638341, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.14212417602539, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.861147940158844, + "num_tokens": 241799445.0, + "step": 6340 + }, + { + "epoch": 0.8066403765424246, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.20868682861328, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8645150661468506, + "num_tokens": 241839333.0, + "step": 6341 + }, + { + "epoch": 0.8067675868210151, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.057172775268555, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8490163087844849, + "num_tokens": 241880248.0, + "step": 6342 + }, + { + "epoch": 0.8068947970996057, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.103025436401367, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8744412660598755, + "num_tokens": 241926717.0, + "step": 6343 + }, + { + "epoch": 0.8070220073781962, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.135589599609375, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8513146638870239, + "num_tokens": 241966080.0, + "step": 6344 + }, + { + "epoch": 0.8071492176567867, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.071874618530273, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8576675057411194, + "num_tokens": 242011975.0, + "step": 6345 + }, + { + "epoch": 0.8072764279353771, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.966394424438477, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8734502792358398, + "num_tokens": 242046320.0, + "step": 6346 + }, + { + "epoch": 0.8074036382139677, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28807830810547, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8620284199714661, + "num_tokens": 242085157.0, + "step": 6347 + }, + { + "epoch": 0.8075308484925582, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.1190185546875, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8566288352012634, + "num_tokens": 242116548.0, + "step": 6348 + }, + { + "epoch": 0.8076580587711487, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28365707397461, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8665584325790405, + "num_tokens": 242151883.0, + "step": 6349 + }, + { + "epoch": 0.8077852690497392, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.995309829711914, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8581761121749878, + "num_tokens": 242190486.0, + "step": 6350 + }, + { + "epoch": 0.8079124793283298, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.181699752807617, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.864388108253479, + "num_tokens": 242232720.0, + "step": 6351 + }, + { + "epoch": 0.8080396896069202, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.081449508666992, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8616694211959839, + "num_tokens": 242273926.0, + "step": 6352 + }, + { + "epoch": 0.8081668998855107, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3295955657959, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8507112264633179, + "num_tokens": 242313484.0, + "step": 6353 + }, + { + "epoch": 0.8082941101641012, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.069734573364258, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8544845581054688, + "num_tokens": 242352101.0, + "step": 6354 + }, + { + "epoch": 0.8084213204426918, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.009458541870117, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8757265210151672, + "num_tokens": 242392958.0, + "step": 6355 + }, + { + "epoch": 0.8085485307212823, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.077131271362305, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8613800406455994, + "num_tokens": 242432327.0, + "step": 6356 + }, + { + "epoch": 0.8086757409998728, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.139484405517578, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8396549224853516, + "num_tokens": 242473714.0, + "step": 6357 + }, + { + "epoch": 0.8088029512784632, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12205696105957, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8675944209098816, + "num_tokens": 242513179.0, + "step": 6358 + }, + { + "epoch": 0.8089301615570538, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.08295440673828, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8608957529067993, + "num_tokens": 242549398.0, + "step": 6359 + }, + { + "epoch": 0.8090573718356443, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18439483642578, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8563028573989868, + "num_tokens": 242583705.0, + "step": 6360 + }, + { + "epoch": 0.8091845821142348, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.98137664794922, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8646669387817383, + "num_tokens": 242625900.0, + "step": 6361 + }, + { + "epoch": 0.8093117923928254, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.356164932250977, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8495333790779114, + "num_tokens": 242665766.0, + "step": 6362 + }, + { + "epoch": 0.8094390026714159, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.115434646606445, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8630987405776978, + "num_tokens": 242707634.0, + "step": 6363 + }, + { + "epoch": 0.8095662129500064, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.129505157470703, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8560798168182373, + "num_tokens": 242746791.0, + "step": 6364 + }, + { + "epoch": 0.8096934232285968, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95977783203125, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8735599517822266, + "num_tokens": 242784861.0, + "step": 6365 + }, + { + "epoch": 0.8098206335071874, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.166234970092773, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8754320740699768, + "num_tokens": 242817612.0, + "step": 6366 + }, + { + "epoch": 0.8099478437857779, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.075891494750977, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8687283992767334, + "num_tokens": 242854951.0, + "step": 6367 + }, + { + "epoch": 0.8100750540643684, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.143081665039062, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8754256367683411, + "num_tokens": 242894166.0, + "step": 6368 + }, + { + "epoch": 0.8102022643429589, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06749153137207, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.866156280040741, + "num_tokens": 242936230.0, + "step": 6369 + }, + { + "epoch": 0.8103294746215495, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.054746627807617, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8720735311508179, + "num_tokens": 242973148.0, + "step": 6370 + }, + { + "epoch": 0.8104566849001399, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18045425415039, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8616651892662048, + "num_tokens": 243013029.0, + "step": 6371 + }, + { + "epoch": 0.8105838951787304, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.010028839111328, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8601510524749756, + "num_tokens": 243054847.0, + "step": 6372 + }, + { + "epoch": 0.810711105457321, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.023889541625977, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8542062044143677, + "num_tokens": 243096524.0, + "step": 6373 + }, + { + "epoch": 0.8108383157359115, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.11980438232422, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8540033102035522, + "num_tokens": 243132397.0, + "step": 6374 + }, + { + "epoch": 0.810965526014502, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.237661361694336, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8474041819572449, + "num_tokens": 243171790.0, + "step": 6375 + }, + { + "epoch": 0.8110927362930925, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.255041122436523, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8602715730667114, + "num_tokens": 243208217.0, + "step": 6376 + }, + { + "epoch": 0.811219946571683, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.088197708129883, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8653531074523926, + "num_tokens": 243245260.0, + "step": 6377 + }, + { + "epoch": 0.8113471568502735, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.25115966796875, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8687559366226196, + "num_tokens": 243283924.0, + "step": 6378 + }, + { + "epoch": 0.811474367128864, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.214754104614258, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8689393401145935, + "num_tokens": 243321977.0, + "step": 6379 + }, + { + "epoch": 0.8116015774074545, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.044424057006836, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8584640026092529, + "num_tokens": 243359349.0, + "step": 6380 + }, + { + "epoch": 0.8117287876860451, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.134925842285156, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8648805618286133, + "num_tokens": 243395722.0, + "step": 6381 + }, + { + "epoch": 0.8118559979646356, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.07355499267578, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8500877618789673, + "num_tokens": 243429022.0, + "step": 6382 + }, + { + "epoch": 0.811983208243226, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18609046936035, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8398348093032837, + "num_tokens": 243468474.0, + "step": 6383 + }, + { + "epoch": 0.8121104185218165, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.14722442626953, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8760673999786377, + "num_tokens": 243503045.0, + "step": 6384 + }, + { + "epoch": 0.8122376288004071, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.149009704589844, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8498591780662537, + "num_tokens": 243543865.0, + "step": 6385 + }, + { + "epoch": 0.8123648390789976, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.384422302246094, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8538200855255127, + "num_tokens": 243581258.0, + "step": 6386 + }, + { + "epoch": 0.8124920493575881, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.057653427124023, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8544498085975647, + "num_tokens": 243622777.0, + "step": 6387 + }, + { + "epoch": 0.8126192596361786, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.920679092407227, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8486695289611816, + "num_tokens": 243664549.0, + "step": 6388 + }, + { + "epoch": 0.8127464699147691, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.32848358154297, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8554815649986267, + "num_tokens": 243710408.0, + "step": 6389 + }, + { + "epoch": 0.8128736801933596, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.089229583740234, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8614144921302795, + "num_tokens": 243752598.0, + "step": 6390 + }, + { + "epoch": 0.8130008904719501, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.963075637817383, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8571587204933167, + "num_tokens": 243789805.0, + "step": 6391 + }, + { + "epoch": 0.8131281007505406, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.233896255493164, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8712307214736938, + "num_tokens": 243824124.0, + "step": 6392 + }, + { + "epoch": 0.8132553110291312, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.059293746948242, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8597569465637207, + "num_tokens": 243863448.0, + "step": 6393 + }, + { + "epoch": 0.8133825213077217, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.126588821411133, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8644253015518188, + "num_tokens": 243902085.0, + "step": 6394 + }, + { + "epoch": 0.8135097315863121, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06757354736328, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8553277254104614, + "num_tokens": 243945349.0, + "step": 6395 + }, + { + "epoch": 0.8136369418649027, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0587158203125, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8578933477401733, + "num_tokens": 243982221.0, + "step": 6396 + }, + { + "epoch": 0.8137641521434932, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.15221405029297, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8476676344871521, + "num_tokens": 244012576.0, + "step": 6397 + }, + { + "epoch": 0.8138913624220837, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.113142013549805, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8674191832542419, + "num_tokens": 244052089.0, + "step": 6398 + }, + { + "epoch": 0.8140185727006742, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.134435653686523, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8620456457138062, + "num_tokens": 244088712.0, + "step": 6399 + }, + { + "epoch": 0.8141457829792648, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.042701721191406, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8618605136871338, + "num_tokens": 244127655.0, + "step": 6400 + }, + { + "epoch": 0.8142729932578552, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.978700637817383, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8657667636871338, + "num_tokens": 244158588.0, + "step": 6401 + }, + { + "epoch": 0.8144002035364457, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.11896514892578, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8789812326431274, + "num_tokens": 244193872.0, + "step": 6402 + }, + { + "epoch": 0.8145274138150362, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.03055763244629, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8457807302474976, + "num_tokens": 244234055.0, + "step": 6403 + }, + { + "epoch": 0.8146546240936268, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.175132751464844, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8609969615936279, + "num_tokens": 244275163.0, + "step": 6404 + }, + { + "epoch": 0.8147818343722173, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09372901916504, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8574969172477722, + "num_tokens": 244315241.0, + "step": 6405 + }, + { + "epoch": 0.8149090446508078, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.049365997314453, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8754117488861084, + "num_tokens": 244351947.0, + "step": 6406 + }, + { + "epoch": 0.8150362549293982, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24545669555664, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8598178625106812, + "num_tokens": 244387706.0, + "step": 6407 + }, + { + "epoch": 0.8151634652079888, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.277729034423828, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8689867258071899, + "num_tokens": 244428447.0, + "step": 6408 + }, + { + "epoch": 0.8152906754865793, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.990097045898438, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.845472514629364, + "num_tokens": 244471191.0, + "step": 6409 + }, + { + "epoch": 0.8154178857651698, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.332237243652344, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8431408405303955, + "num_tokens": 244509198.0, + "step": 6410 + }, + { + "epoch": 0.8155450960437604, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.116086959838867, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8519274592399597, + "num_tokens": 244549677.0, + "step": 6411 + }, + { + "epoch": 0.8156723063223509, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.19076919555664, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.847995400428772, + "num_tokens": 244583586.0, + "step": 6412 + }, + { + "epoch": 0.8157995166009414, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.165170669555664, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8681212067604065, + "num_tokens": 244621220.0, + "step": 6413 + }, + { + "epoch": 0.8159267268795318, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12793731689453, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.867416501045227, + "num_tokens": 244655941.0, + "step": 6414 + }, + { + "epoch": 0.8160539371581224, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.032148361206055, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8777732849121094, + "num_tokens": 244685807.0, + "step": 6415 + }, + { + "epoch": 0.8161811474367129, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.347822189331055, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8584926128387451, + "num_tokens": 244726866.0, + "step": 6416 + }, + { + "epoch": 0.8163083577153034, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.22174835205078, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8520472049713135, + "num_tokens": 244767273.0, + "step": 6417 + }, + { + "epoch": 0.8164355679938939, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0543212890625, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8557869791984558, + "num_tokens": 244806111.0, + "step": 6418 + }, + { + "epoch": 0.8165627782724845, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.36944007873535, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8711286187171936, + "num_tokens": 244841706.0, + "step": 6419 + }, + { + "epoch": 0.8166899885510749, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.94362449645996, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8631158471107483, + "num_tokens": 244883908.0, + "step": 6420 + }, + { + "epoch": 0.8168171988296654, + "ewc_loss": 0.026611328125, + "ewc_loss_parallel": 2.658367156982422e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.20382308959961, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8528251647949219, + "num_tokens": 244919090.0, + "step": 6421 + }, + { + "epoch": 0.8169444091082559, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.37186050415039, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8752084970474243, + "num_tokens": 244961141.0, + "step": 6422 + }, + { + "epoch": 0.8170716193868465, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.068819046020508, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8439669609069824, + "num_tokens": 245004503.0, + "step": 6423 + }, + { + "epoch": 0.817198829665437, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0457763671875, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8595314025878906, + "num_tokens": 245045042.0, + "step": 6424 + }, + { + "epoch": 0.8173260399440275, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.25054359436035, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8592954874038696, + "num_tokens": 245085416.0, + "step": 6425 + }, + { + "epoch": 0.8174532502226179, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.075525283813477, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8501995801925659, + "num_tokens": 245116137.0, + "step": 6426 + }, + { + "epoch": 0.8175804605012085, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.103418350219727, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8727579712867737, + "num_tokens": 245159181.0, + "step": 6427 + }, + { + "epoch": 0.817707670779799, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.180192947387695, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8528150916099548, + "num_tokens": 245205199.0, + "step": 6428 + }, + { + "epoch": 0.8178348810583895, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.094526290893555, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8645535707473755, + "num_tokens": 245242455.0, + "step": 6429 + }, + { + "epoch": 0.8179620913369801, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.057167053222656, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8796695470809937, + "num_tokens": 245280675.0, + "step": 6430 + }, + { + "epoch": 0.8180893016155706, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9987850189209, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8765581250190735, + "num_tokens": 245316947.0, + "step": 6431 + }, + { + "epoch": 0.818216511894161, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.182357788085938, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8649441599845886, + "num_tokens": 245355145.0, + "step": 6432 + }, + { + "epoch": 0.8183437221727515, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.081340789794922, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8718706965446472, + "num_tokens": 245390282.0, + "step": 6433 + }, + { + "epoch": 0.8184709324513421, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.956754684448242, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8703941106796265, + "num_tokens": 245430262.0, + "step": 6434 + }, + { + "epoch": 0.8185981427299326, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.01213264465332, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8577359318733215, + "num_tokens": 245470874.0, + "step": 6435 + }, + { + "epoch": 0.8187253530085231, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.092084884643555, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8672628402709961, + "num_tokens": 245506626.0, + "step": 6436 + }, + { + "epoch": 0.8188525632871136, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.025257110595703, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8449727296829224, + "num_tokens": 245546794.0, + "step": 6437 + }, + { + "epoch": 0.8189797735657041, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.03449058532715, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.863320529460907, + "num_tokens": 245589544.0, + "step": 6438 + }, + { + "epoch": 0.8191069838442946, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.029048919677734, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8768168687820435, + "num_tokens": 245628889.0, + "step": 6439 + }, + { + "epoch": 0.8192341941228851, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.93435287475586, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8579092621803284, + "num_tokens": 245674615.0, + "step": 6440 + }, + { + "epoch": 0.8193614044014756, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.024402618408203, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8603395223617554, + "num_tokens": 245713966.0, + "step": 6441 + }, + { + "epoch": 0.8194886146800662, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.020912170410156, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8656332492828369, + "num_tokens": 245750035.0, + "step": 6442 + }, + { + "epoch": 0.8196158249586567, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.021812438964844, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8753892779350281, + "num_tokens": 245788506.0, + "step": 6443 + }, + { + "epoch": 0.8197430352372471, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.10270118713379, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8644245266914368, + "num_tokens": 245826547.0, + "step": 6444 + }, + { + "epoch": 0.8198702455158376, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.080854415893555, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8518098592758179, + "num_tokens": 245874363.0, + "step": 6445 + }, + { + "epoch": 0.8199974557944282, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.195392608642578, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.842311680316925, + "num_tokens": 245917813.0, + "step": 6446 + }, + { + "epoch": 0.8201246660730187, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.217655181884766, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8571004867553711, + "num_tokens": 245952177.0, + "step": 6447 + }, + { + "epoch": 0.8202518763516092, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.303789138793945, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8632556200027466, + "num_tokens": 245992897.0, + "step": 6448 + }, + { + "epoch": 0.8203790866301998, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.052669525146484, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8697067499160767, + "num_tokens": 246029657.0, + "step": 6449 + }, + { + "epoch": 0.8205062969087902, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.410381317138672, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8502682447433472, + "num_tokens": 246072425.0, + "step": 6450 + }, + { + "epoch": 0.8206335071873807, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.1239013671875, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8788173794746399, + "num_tokens": 246113131.0, + "step": 6451 + }, + { + "epoch": 0.8207607174659712, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35790252685547, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8641600608825684, + "num_tokens": 246154548.0, + "step": 6452 + }, + { + "epoch": 0.8208879277445618, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06889533996582, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8573954105377197, + "num_tokens": 246189518.0, + "step": 6453 + }, + { + "epoch": 0.8210151380231523, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.148998260498047, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8567008376121521, + "num_tokens": 246235248.0, + "step": 6454 + }, + { + "epoch": 0.8211423483017428, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.140487670898438, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8546096086502075, + "num_tokens": 246277016.0, + "step": 6455 + }, + { + "epoch": 0.8212695585803332, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.49595832824707, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8581088185310364, + "num_tokens": 246312352.0, + "step": 6456 + }, + { + "epoch": 0.8213967688589238, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06416893005371, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8477956652641296, + "num_tokens": 246355079.0, + "step": 6457 + }, + { + "epoch": 0.8215239791375143, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.123722076416016, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8482943773269653, + "num_tokens": 246391073.0, + "step": 6458 + }, + { + "epoch": 0.8216511894161048, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.21968650817871, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8668818473815918, + "num_tokens": 246435407.0, + "step": 6459 + }, + { + "epoch": 0.8217783996946953, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18050765991211, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8477624654769897, + "num_tokens": 246475303.0, + "step": 6460 + }, + { + "epoch": 0.8219056099732859, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.008132934570312, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8582677841186523, + "num_tokens": 246515237.0, + "step": 6461 + }, + { + "epoch": 0.8220328202518764, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.058042526245117, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8560745716094971, + "num_tokens": 246554958.0, + "step": 6462 + }, + { + "epoch": 0.8221600305304668, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.001131057739258, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8614126443862915, + "num_tokens": 246592327.0, + "step": 6463 + }, + { + "epoch": 0.8222872408090574, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.072063446044922, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8651180267333984, + "num_tokens": 246626965.0, + "step": 6464 + }, + { + "epoch": 0.8224144510876479, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.19202423095703, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.857043981552124, + "num_tokens": 246662049.0, + "step": 6465 + }, + { + "epoch": 0.8225416613662384, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91138458251953, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8625162839889526, + "num_tokens": 246696989.0, + "step": 6466 + }, + { + "epoch": 0.8226688716448289, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.06461524963379, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8650106191635132, + "num_tokens": 246738418.0, + "step": 6467 + }, + { + "epoch": 0.8227960819234195, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3198299407959, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8623123168945312, + "num_tokens": 246776779.0, + "step": 6468 + }, + { + "epoch": 0.8229232922020099, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9609317779541, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8673238754272461, + "num_tokens": 246814029.0, + "step": 6469 + }, + { + "epoch": 0.8230505024806004, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.242469787597656, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8526344299316406, + "num_tokens": 246856390.0, + "step": 6470 + }, + { + "epoch": 0.8231777127591909, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.206443786621094, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8618859052658081, + "num_tokens": 246895563.0, + "step": 6471 + }, + { + "epoch": 0.8233049230377815, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.255701065063477, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8569186329841614, + "num_tokens": 246935956.0, + "step": 6472 + }, + { + "epoch": 0.823432133316372, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.135055541992188, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8565409183502197, + "num_tokens": 246984765.0, + "step": 6473 + }, + { + "epoch": 0.8235593435949625, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.362136840820312, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.863682746887207, + "num_tokens": 247017086.0, + "step": 6474 + }, + { + "epoch": 0.8236865538735529, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.945022583007812, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8661403656005859, + "num_tokens": 247056195.0, + "step": 6475 + }, + { + "epoch": 0.8238137641521435, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.073537826538086, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8738910555839539, + "num_tokens": 247094063.0, + "step": 6476 + }, + { + "epoch": 0.823940974430734, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.187376022338867, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8601605892181396, + "num_tokens": 247131743.0, + "step": 6477 + }, + { + "epoch": 0.8240681847093245, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.984533309936523, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8583035469055176, + "num_tokens": 247169477.0, + "step": 6478 + }, + { + "epoch": 0.824195394987915, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.152555465698242, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8699653744697571, + "num_tokens": 247209159.0, + "step": 6479 + }, + { + "epoch": 0.8243226052665056, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.177831649780273, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8624988794326782, + "num_tokens": 247247603.0, + "step": 6480 + }, + { + "epoch": 0.824449815545096, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.144540786743164, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8708807229995728, + "num_tokens": 247290184.0, + "step": 6481 + }, + { + "epoch": 0.8245770258236865, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.21472930908203, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8431962728500366, + "num_tokens": 247331862.0, + "step": 6482 + }, + { + "epoch": 0.824704236102277, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18839454650879, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8608740568161011, + "num_tokens": 247368271.0, + "step": 6483 + }, + { + "epoch": 0.8248314463808676, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.211605072021484, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8599073886871338, + "num_tokens": 247408077.0, + "step": 6484 + }, + { + "epoch": 0.8249586566594581, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.142253875732422, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8691403269767761, + "num_tokens": 247447408.0, + "step": 6485 + }, + { + "epoch": 0.8250858669380486, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.01828384399414, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8722982406616211, + "num_tokens": 247489863.0, + "step": 6486 + }, + { + "epoch": 0.8252130772166391, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.13508415222168, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8574404716491699, + "num_tokens": 247532233.0, + "step": 6487 + }, + { + "epoch": 0.8253402874952296, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.198335647583008, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8357227444648743, + "num_tokens": 247570719.0, + "step": 6488 + }, + { + "epoch": 0.8254674977738201, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.29071617126465, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8663398027420044, + "num_tokens": 247603302.0, + "step": 6489 + }, + { + "epoch": 0.8255947080524106, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0618953704834, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8641999959945679, + "num_tokens": 247642837.0, + "step": 6490 + }, + { + "epoch": 0.8257219183310012, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.072818756103516, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8576563596725464, + "num_tokens": 247685246.0, + "step": 6491 + }, + { + "epoch": 0.8258491286095917, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.30533218383789, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8529685735702515, + "num_tokens": 247720871.0, + "step": 6492 + }, + { + "epoch": 0.8259763388881821, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.91893768310547, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8621982932090759, + "num_tokens": 247759796.0, + "step": 6493 + }, + { + "epoch": 0.8261035491667726, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.336753845214844, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8399043083190918, + "num_tokens": 247806486.0, + "step": 6494 + }, + { + "epoch": 0.8262307594453632, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.095809936523438, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8693068027496338, + "num_tokens": 247842031.0, + "step": 6495 + }, + { + "epoch": 0.8263579697239537, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.068241119384766, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8644561767578125, + "num_tokens": 247883438.0, + "step": 6496 + }, + { + "epoch": 0.8264851800025442, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.069292068481445, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8693400025367737, + "num_tokens": 247921491.0, + "step": 6497 + }, + { + "epoch": 0.8266123902811348, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12454605102539, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8710867166519165, + "num_tokens": 247957454.0, + "step": 6498 + }, + { + "epoch": 0.8267396005597252, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.040035247802734, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8460099101066589, + "num_tokens": 247997705.0, + "step": 6499 + }, + { + "epoch": 0.8268668108383157, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.217071533203125, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8562028408050537, + "num_tokens": 248034536.0, + "step": 6500 + }, + { + "epoch": 0.8269940211169062, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.032957077026367, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8620256185531616, + "num_tokens": 248080916.0, + "step": 6501 + }, + { + "epoch": 0.8271212313954968, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.99283218383789, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8597285151481628, + "num_tokens": 248119719.0, + "step": 6502 + }, + { + "epoch": 0.8272484416740873, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9995174407959, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8582429885864258, + "num_tokens": 248156221.0, + "step": 6503 + }, + { + "epoch": 0.8273756519526778, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.212261199951172, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8654029369354248, + "num_tokens": 248191780.0, + "step": 6504 + }, + { + "epoch": 0.8275028622312682, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.099010467529297, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8558829426765442, + "num_tokens": 248228594.0, + "step": 6505 + }, + { + "epoch": 0.8276300725098588, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.321813583374023, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8653165102005005, + "num_tokens": 248260372.0, + "step": 6506 + }, + { + "epoch": 0.8277572827884493, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.122573852539062, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8535213470458984, + "num_tokens": 248299361.0, + "step": 6507 + }, + { + "epoch": 0.8278844930670398, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.9700870513916, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8521665334701538, + "num_tokens": 248337370.0, + "step": 6508 + }, + { + "epoch": 0.8280117033456303, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.070823669433594, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.863524317741394, + "num_tokens": 248377541.0, + "step": 6509 + }, + { + "epoch": 0.8281389136242209, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.200420379638672, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8431490659713745, + "num_tokens": 248413928.0, + "step": 6510 + }, + { + "epoch": 0.8282661239028114, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.063766479492188, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8605364561080933, + "num_tokens": 248453283.0, + "step": 6511 + }, + { + "epoch": 0.8283933341814018, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.14900016784668, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8662046790122986, + "num_tokens": 248490286.0, + "step": 6512 + }, + { + "epoch": 0.8285205444599923, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.078327178955078, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8457839488983154, + "num_tokens": 248526349.0, + "step": 6513 + }, + { + "epoch": 0.8286477547385829, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.164794921875, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8530415296554565, + "num_tokens": 248561751.0, + "step": 6514 + }, + { + "epoch": 0.8287749650171734, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.29014778137207, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8737972378730774, + "num_tokens": 248601921.0, + "step": 6515 + }, + { + "epoch": 0.8289021752957639, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.185991287231445, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8719888925552368, + "num_tokens": 248641218.0, + "step": 6516 + }, + { + "epoch": 0.8290293855743545, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.126901626586914, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8615095019340515, + "num_tokens": 248672616.0, + "step": 6517 + }, + { + "epoch": 0.8291565958529449, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.205358505249023, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8647533655166626, + "num_tokens": 248708212.0, + "step": 6518 + }, + { + "epoch": 0.8292838061315354, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.05115509033203, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8663890361785889, + "num_tokens": 248746575.0, + "step": 6519 + }, + { + "epoch": 0.8294110164101259, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.994049072265625, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8565884828567505, + "num_tokens": 248789423.0, + "step": 6520 + }, + { + "epoch": 0.8295382266887165, + "ewc_loss": 0.02685546875, + "ewc_loss_parallel": 2.682209014892578e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.215831756591797, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8631843328475952, + "num_tokens": 248824044.0, + "step": 6521 + }, + { + "epoch": 0.829665436967307, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.095001220703125, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.858395516872406, + "num_tokens": 248860805.0, + "step": 6522 + }, + { + "epoch": 0.8297926472458975, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.03032875061035, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8676472902297974, + "num_tokens": 248898958.0, + "step": 6523 + }, + { + "epoch": 0.8299198575244879, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.183481216430664, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.859928548336029, + "num_tokens": 248938847.0, + "step": 6524 + }, + { + "epoch": 0.8300470678030785, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.16603660583496, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8579710721969604, + "num_tokens": 248977944.0, + "step": 6525 + }, + { + "epoch": 0.830174278081669, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.163671493530273, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8539944887161255, + "num_tokens": 249014610.0, + "step": 6526 + }, + { + "epoch": 0.8303014883602595, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.209491729736328, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8302983641624451, + "num_tokens": 249053869.0, + "step": 6527 + }, + { + "epoch": 0.83042869863885, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.443632125854492, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8539159297943115, + "num_tokens": 249096874.0, + "step": 6528 + }, + { + "epoch": 0.8305559089174406, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.075653076171875, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8598943948745728, + "num_tokens": 249133141.0, + "step": 6529 + }, + { + "epoch": 0.830683119196031, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.217918395996094, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8653461933135986, + "num_tokens": 249163813.0, + "step": 6530 + }, + { + "epoch": 0.8308103294746215, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.212146759033203, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8530335426330566, + "num_tokens": 249202935.0, + "step": 6531 + }, + { + "epoch": 0.830937539753212, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.175748825073242, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8679220676422119, + "num_tokens": 249241061.0, + "step": 6532 + }, + { + "epoch": 0.8310647500318026, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.32851219177246, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8488101959228516, + "num_tokens": 249277863.0, + "step": 6533 + }, + { + "epoch": 0.8311919603103931, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.281766891479492, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8600817918777466, + "num_tokens": 249318047.0, + "step": 6534 + }, + { + "epoch": 0.8313191705889836, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.044292449951172, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8551809191703796, + "num_tokens": 249362095.0, + "step": 6535 + }, + { + "epoch": 0.831446380867574, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.144298553466797, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.874087393283844, + "num_tokens": 249403362.0, + "step": 6536 + }, + { + "epoch": 0.8315735911461646, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.117889404296875, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8678549528121948, + "num_tokens": 249438051.0, + "step": 6537 + }, + { + "epoch": 0.8317008014247551, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.037479400634766, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8653408288955688, + "num_tokens": 249477185.0, + "step": 6538 + }, + { + "epoch": 0.8318280117033456, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.066173553466797, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8539918661117554, + "num_tokens": 249516880.0, + "step": 6539 + }, + { + "epoch": 0.8319552219819362, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.047754287719727, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8711328506469727, + "num_tokens": 249555864.0, + "step": 6540 + }, + { + "epoch": 0.8320824322605267, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24569320678711, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8598607778549194, + "num_tokens": 249590566.0, + "step": 6541 + }, + { + "epoch": 0.8322096425391171, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.0201473236084, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8735103607177734, + "num_tokens": 249631523.0, + "step": 6542 + }, + { + "epoch": 0.8323368528177076, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.390962600708008, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8695917129516602, + "num_tokens": 249674113.0, + "step": 6543 + }, + { + "epoch": 0.8324640630962982, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.23052406311035, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8712450861930847, + "num_tokens": 249712934.0, + "step": 6544 + }, + { + "epoch": 0.8325912733748887, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.471860885620117, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8659578561782837, + "num_tokens": 249750295.0, + "step": 6545 + }, + { + "epoch": 0.8327184836534792, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3062744140625, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8664008378982544, + "num_tokens": 249790289.0, + "step": 6546 + }, + { + "epoch": 0.8328456939320698, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.356626510620117, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8746992945671082, + "num_tokens": 249822873.0, + "step": 6547 + }, + { + "epoch": 0.8329729042106602, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.250354766845703, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8627772331237793, + "num_tokens": 249860564.0, + "step": 6548 + }, + { + "epoch": 0.8331001144892507, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.13280487060547, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8605437874794006, + "num_tokens": 249901448.0, + "step": 6549 + }, + { + "epoch": 0.8332273247678412, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.64605140686035, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8511340618133545, + "num_tokens": 249933524.0, + "step": 6550 + }, + { + "epoch": 0.8333545350464318, + "ewc_loss": 0.0267333984375, + "ewc_loss_parallel": 2.6702880859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.982177734375, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8561087250709534, + "num_tokens": 249971856.0, + "step": 6551 + }, + { + "epoch": 0.8334817453250223, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3652400970459, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.85318922996521, + "num_tokens": 250013940.0, + "step": 6552 + }, + { + "epoch": 0.8336089556036128, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.386962890625, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.843220591545105, + "num_tokens": 250044826.0, + "step": 6553 + }, + { + "epoch": 0.8337361658822032, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.09979248046875, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8453073501586914, + "num_tokens": 250087409.0, + "step": 6554 + }, + { + "epoch": 0.8338633761607938, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.354476928710938, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8680797815322876, + "num_tokens": 250126236.0, + "step": 6555 + }, + { + "epoch": 0.8339905864393843, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.2764949798584, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8691401481628418, + "num_tokens": 250163559.0, + "step": 6556 + }, + { + "epoch": 0.8341177967179748, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.046600341796875, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8617960214614868, + "num_tokens": 250203536.0, + "step": 6557 + }, + { + "epoch": 0.8342450069965653, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.2083683013916, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8651447296142578, + "num_tokens": 250237455.0, + "step": 6558 + }, + { + "epoch": 0.8343722172751559, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.186784744262695, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8690845966339111, + "num_tokens": 250274559.0, + "step": 6559 + }, + { + "epoch": 0.8344994275537464, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.23448944091797, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8629332780838013, + "num_tokens": 250309392.0, + "step": 6560 + }, + { + "epoch": 0.8346266378323368, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.04778480529785, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.866283655166626, + "num_tokens": 250352010.0, + "step": 6561 + }, + { + "epoch": 0.8347538481109273, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.205711364746094, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8491142988204956, + "num_tokens": 250385745.0, + "step": 6562 + }, + { + "epoch": 0.8348810583895179, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.109926223754883, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8589514493942261, + "num_tokens": 250426055.0, + "step": 6563 + }, + { + "epoch": 0.8350082686681084, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.264938354492188, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8591338396072388, + "num_tokens": 250468795.0, + "step": 6564 + }, + { + "epoch": 0.8351354789466989, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12257957458496, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8596891164779663, + "num_tokens": 250509115.0, + "step": 6565 + }, + { + "epoch": 0.8352626892252895, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.175800323486328, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.861996054649353, + "num_tokens": 250547770.0, + "step": 6566 + }, + { + "epoch": 0.8353898995038799, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.981338500976562, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8583575487136841, + "num_tokens": 250589737.0, + "step": 6567 + }, + { + "epoch": 0.8355171097824704, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.251195907592773, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8697787523269653, + "num_tokens": 250623058.0, + "step": 6568 + }, + { + "epoch": 0.8356443200610609, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.197622299194336, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8616527915000916, + "num_tokens": 250660526.0, + "step": 6569 + }, + { + "epoch": 0.8357715303396515, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.04412841796875, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8450154066085815, + "num_tokens": 250698218.0, + "step": 6570 + }, + { + "epoch": 0.835898740618242, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.150022506713867, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8542118072509766, + "num_tokens": 250743410.0, + "step": 6571 + }, + { + "epoch": 0.8360259508968325, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.21690559387207, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8704765439033508, + "num_tokens": 250781919.0, + "step": 6572 + }, + { + "epoch": 0.8361531611754229, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.00634765625, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.858986496925354, + "num_tokens": 250817051.0, + "step": 6573 + }, + { + "epoch": 0.8362803714540135, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.303863525390625, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8579047918319702, + "num_tokens": 250857241.0, + "step": 6574 + }, + { + "epoch": 0.836407581732604, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.95201873779297, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8624488711357117, + "num_tokens": 250903591.0, + "step": 6575 + }, + { + "epoch": 0.8365347920111945, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.112154006958008, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8538220524787903, + "num_tokens": 250945788.0, + "step": 6576 + }, + { + "epoch": 0.836662002289785, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.99091339111328, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8543369770050049, + "num_tokens": 250984884.0, + "step": 6577 + }, + { + "epoch": 0.8367892125683756, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.983821868896484, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8723076581954956, + "num_tokens": 251024347.0, + "step": 6578 + }, + { + "epoch": 0.836916422846966, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18116569519043, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8828431367874146, + "num_tokens": 251068527.0, + "step": 6579 + }, + { + "epoch": 0.8370436331255565, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.18620491027832, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8658305406570435, + "num_tokens": 251109914.0, + "step": 6580 + }, + { + "epoch": 0.837170843404147, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.07627296447754, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8525735139846802, + "num_tokens": 251144760.0, + "step": 6581 + }, + { + "epoch": 0.8372980536827376, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24932861328125, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8594323396682739, + "num_tokens": 251184456.0, + "step": 6582 + }, + { + "epoch": 0.8374252639613281, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.15288543701172, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8531160950660706, + "num_tokens": 251229493.0, + "step": 6583 + }, + { + "epoch": 0.8375524742399186, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24681282043457, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8654873371124268, + "num_tokens": 251270886.0, + "step": 6584 + }, + { + "epoch": 0.837679684518509, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.01373291015625, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8706735372543335, + "num_tokens": 251304833.0, + "step": 6585 + }, + { + "epoch": 0.8378068947970996, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.11864471435547, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8637816905975342, + "num_tokens": 251334196.0, + "step": 6586 + }, + { + "epoch": 0.8379341050756901, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.10289192199707, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8497272729873657, + "num_tokens": 251380278.0, + "step": 6587 + }, + { + "epoch": 0.8380613153542806, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.130508422851562, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8552491664886475, + "num_tokens": 251418432.0, + "step": 6588 + }, + { + "epoch": 0.8381885256328712, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.16398811340332, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.858036458492279, + "num_tokens": 251453747.0, + "step": 6589 + }, + { + "epoch": 0.8383157359114617, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.225276947021484, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8613079190254211, + "num_tokens": 251492439.0, + "step": 6590 + }, + { + "epoch": 0.8384429461900521, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.173622131347656, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8620632290840149, + "num_tokens": 251527945.0, + "step": 6591 + }, + { + "epoch": 0.8385701564686426, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.975400924682617, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8689456582069397, + "num_tokens": 251567161.0, + "step": 6592 + }, + { + "epoch": 0.8386973667472332, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.144739151000977, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8607767224311829, + "num_tokens": 251601189.0, + "step": 6593 + }, + { + "epoch": 0.8388245770258237, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.023027420043945, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8584781885147095, + "num_tokens": 251630846.0, + "step": 6594 + }, + { + "epoch": 0.8389517873044142, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.163143157958984, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.868682861328125, + "num_tokens": 251665701.0, + "step": 6595 + }, + { + "epoch": 0.8390789975830047, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.149288177490234, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.853498637676239, + "num_tokens": 251699909.0, + "step": 6596 + }, + { + "epoch": 0.8392062078615952, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.98895835876465, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8593043684959412, + "num_tokens": 251734052.0, + "step": 6597 + }, + { + "epoch": 0.8393334181401857, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.072614669799805, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8500300645828247, + "num_tokens": 251772518.0, + "step": 6598 + }, + { + "epoch": 0.8394606284187762, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.137006759643555, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8589082956314087, + "num_tokens": 251816378.0, + "step": 6599 + }, + { + "epoch": 0.8395878386973668, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.155437469482422, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8605740070343018, + "num_tokens": 251851264.0, + "step": 6600 + }, + { + "epoch": 0.8397150489759573, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.27048683166504, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8633276224136353, + "num_tokens": 251888906.0, + "step": 6601 + }, + { + "epoch": 0.8398422592545478, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.164897918701172, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8677687644958496, + "num_tokens": 251930641.0, + "step": 6602 + }, + { + "epoch": 0.8399694695331382, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.255550384521484, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8423236608505249, + "num_tokens": 251968929.0, + "step": 6603 + }, + { + "epoch": 0.8400966798117288, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.050945281982422, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8538479804992676, + "num_tokens": 252006751.0, + "step": 6604 + }, + { + "epoch": 0.8402238900903193, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.27676773071289, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8574960827827454, + "num_tokens": 252045861.0, + "step": 6605 + }, + { + "epoch": 0.8403511003689098, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.300365447998047, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8549399375915527, + "num_tokens": 252084412.0, + "step": 6606 + }, + { + "epoch": 0.8404783106475003, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.190509796142578, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8655983209609985, + "num_tokens": 252120576.0, + "step": 6607 + }, + { + "epoch": 0.8406055209260909, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.37982749938965, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8588106036186218, + "num_tokens": 252161636.0, + "step": 6608 + }, + { + "epoch": 0.8407327312046813, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.2072811126709, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8616774082183838, + "num_tokens": 252199669.0, + "step": 6609 + }, + { + "epoch": 0.8408599414832718, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.453548431396484, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8682857751846313, + "num_tokens": 252236832.0, + "step": 6610 + }, + { + "epoch": 0.8409871517618623, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.106172561645508, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8565716743469238, + "num_tokens": 252282518.0, + "step": 6611 + }, + { + "epoch": 0.8411143620404529, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.222862243652344, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8547172546386719, + "num_tokens": 252316479.0, + "step": 6612 + }, + { + "epoch": 0.8412415723190434, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.31659507751465, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8548043370246887, + "num_tokens": 252353789.0, + "step": 6613 + }, + { + "epoch": 0.8413687825976339, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.136669158935547, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8638789057731628, + "num_tokens": 252391609.0, + "step": 6614 + }, + { + "epoch": 0.8414959928762245, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.157907485961914, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.873438835144043, + "num_tokens": 252433514.0, + "step": 6615 + }, + { + "epoch": 0.8416232031548149, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.20441436767578, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8652029037475586, + "num_tokens": 252469700.0, + "step": 6616 + }, + { + "epoch": 0.8417504134334054, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.143503189086914, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8577706813812256, + "num_tokens": 252501658.0, + "step": 6617 + }, + { + "epoch": 0.8418776237119959, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.182701110839844, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8630199432373047, + "num_tokens": 252534546.0, + "step": 6618 + }, + { + "epoch": 0.8420048339905865, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.21438217163086, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8683902621269226, + "num_tokens": 252573172.0, + "step": 6619 + }, + { + "epoch": 0.842132044269177, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.26333236694336, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8696184158325195, + "num_tokens": 252607258.0, + "step": 6620 + }, + { + "epoch": 0.8422592545477675, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.157054901123047, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.871481716632843, + "num_tokens": 252651451.0, + "step": 6621 + }, + { + "epoch": 0.8423864648263579, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.412403106689453, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8726316690444946, + "num_tokens": 252686348.0, + "step": 6622 + }, + { + "epoch": 0.8425136751049485, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.177106857299805, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8555996417999268, + "num_tokens": 252724316.0, + "step": 6623 + }, + { + "epoch": 0.842640885383539, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.167245864868164, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8504464030265808, + "num_tokens": 252764612.0, + "step": 6624 + }, + { + "epoch": 0.8427680956621295, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.25497055053711, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8400406837463379, + "num_tokens": 252809510.0, + "step": 6625 + }, + { + "epoch": 0.84289530594072, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.424182891845703, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8587532043457031, + "num_tokens": 252847598.0, + "step": 6626 + }, + { + "epoch": 0.8430225162193106, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.13512420654297, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8464949131011963, + "num_tokens": 252884401.0, + "step": 6627 + }, + { + "epoch": 0.843149726497901, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.155054092407227, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8448581695556641, + "num_tokens": 252928348.0, + "step": 6628 + }, + { + "epoch": 0.8432769367764915, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.223751068115234, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.860739529132843, + "num_tokens": 252965084.0, + "step": 6629 + }, + { + "epoch": 0.843404147055082, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.180044174194336, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8651472926139832, + "num_tokens": 253003697.0, + "step": 6630 + }, + { + "epoch": 0.8435313573336726, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.255064010620117, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8474441766738892, + "num_tokens": 253037239.0, + "step": 6631 + }, + { + "epoch": 0.8436585676122631, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.186853408813477, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8518013954162598, + "num_tokens": 253078163.0, + "step": 6632 + }, + { + "epoch": 0.8437857778908536, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.322717666625977, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8592827320098877, + "num_tokens": 253118907.0, + "step": 6633 + }, + { + "epoch": 0.843912988169444, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.14948844909668, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8787376880645752, + "num_tokens": 253153132.0, + "step": 6634 + }, + { + "epoch": 0.8440401984480346, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.44056510925293, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8567929863929749, + "num_tokens": 253194446.0, + "step": 6635 + }, + { + "epoch": 0.8441674087266251, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.184295654296875, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.863820493221283, + "num_tokens": 253230798.0, + "step": 6636 + }, + { + "epoch": 0.8442946190052156, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.207557678222656, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8529295325279236, + "num_tokens": 253269029.0, + "step": 6637 + }, + { + "epoch": 0.8444218292838062, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.173450469970703, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8565049767494202, + "num_tokens": 253310656.0, + "step": 6638 + }, + { + "epoch": 0.8445490395623967, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.206661224365234, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8792409896850586, + "num_tokens": 253346882.0, + "step": 6639 + }, + { + "epoch": 0.8446762498409871, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.13544464111328, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.851008951663971, + "num_tokens": 253389601.0, + "step": 6640 + }, + { + "epoch": 0.8448034601195776, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.335147857666016, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8587774038314819, + "num_tokens": 253428725.0, + "step": 6641 + }, + { + "epoch": 0.8449306703981682, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.27315330505371, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8748600482940674, + "num_tokens": 253461404.0, + "step": 6642 + }, + { + "epoch": 0.8450578806767587, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.455280303955078, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8649513721466064, + "num_tokens": 253500324.0, + "step": 6643 + }, + { + "epoch": 0.8451850909553492, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.2123966217041, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8478527665138245, + "num_tokens": 253542500.0, + "step": 6644 + }, + { + "epoch": 0.8453123012339397, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.270069122314453, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8433372974395752, + "num_tokens": 253581887.0, + "step": 6645 + }, + { + "epoch": 0.8454395115125302, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.464920043945312, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8688612580299377, + "num_tokens": 253616942.0, + "step": 6646 + }, + { + "epoch": 0.8455667217911207, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.12668800354004, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8540488481521606, + "num_tokens": 253652610.0, + "step": 6647 + }, + { + "epoch": 0.8456939320697112, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.172313690185547, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8575524091720581, + "num_tokens": 253686269.0, + "step": 6648 + }, + { + "epoch": 0.8458211423483017, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.193788528442383, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8541473150253296, + "num_tokens": 253723632.0, + "step": 6649 + }, + { + "epoch": 0.8459483526268923, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.22821617126465, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8534724712371826, + "num_tokens": 253755990.0, + "step": 6650 + }, + { + "epoch": 0.8460755629054828, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 20.98998260498047, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8556448221206665, + "num_tokens": 253794200.0, + "step": 6651 + }, + { + "epoch": 0.8462027731840732, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.17091178894043, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.871669352054596, + "num_tokens": 253840554.0, + "step": 6652 + }, + { + "epoch": 0.8463299834626637, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.10148811340332, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.869608998298645, + "num_tokens": 253873642.0, + "step": 6653 + }, + { + "epoch": 0.8464571937412543, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.211265563964844, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8659750819206238, + "num_tokens": 253911839.0, + "step": 6654 + }, + { + "epoch": 0.8465844040198448, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.077560424804688, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8506823778152466, + "num_tokens": 253944649.0, + "step": 6655 + }, + { + "epoch": 0.8467116142984353, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 34.043338775634766, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8593612313270569, + "num_tokens": 253977739.0, + "step": 6656 + }, + { + "epoch": 0.8468388245770259, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.393335342407227, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8466705083847046, + "num_tokens": 254021216.0, + "step": 6657 + }, + { + "epoch": 0.8469660348556163, + "ewc_loss": 0.0260009765625, + "ewc_loss_parallel": 2.5987625122070312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 19.286672592163086, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8581811189651489, + "num_tokens": 254063209.0, + "step": 6658 + }, + { + "epoch": 0.8470932451342068, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.187747955322266, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8562476634979248, + "num_tokens": 254102325.0, + "step": 6659 + }, + { + "epoch": 0.8472204554127973, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.359678268432617, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8656249642372131, + "num_tokens": 254136991.0, + "step": 6660 + }, + { + "epoch": 0.8473476656913879, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39400863647461, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8604275584220886, + "num_tokens": 254179520.0, + "step": 6661 + }, + { + "epoch": 0.8474748759699784, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.324724197387695, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8604515194892883, + "num_tokens": 254220661.0, + "step": 6662 + }, + { + "epoch": 0.8476020862485689, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40699005126953, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8679262399673462, + "num_tokens": 254261853.0, + "step": 6663 + }, + { + "epoch": 0.8477292965271594, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.508607864379883, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.861259937286377, + "num_tokens": 254301577.0, + "step": 6664 + }, + { + "epoch": 0.8478565068057499, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62639617919922, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8734012842178345, + "num_tokens": 254341792.0, + "step": 6665 + }, + { + "epoch": 0.8479837170843404, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.405574798583984, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8482403755187988, + "num_tokens": 254376755.0, + "step": 6666 + }, + { + "epoch": 0.8481109273629309, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.592252731323242, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8715106248855591, + "num_tokens": 254413565.0, + "step": 6667 + }, + { + "epoch": 0.8482381376415215, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48655891418457, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8496801853179932, + "num_tokens": 254449889.0, + "step": 6668 + }, + { + "epoch": 0.848365347920112, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.64767074584961, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8744497299194336, + "num_tokens": 254488129.0, + "step": 6669 + }, + { + "epoch": 0.8484925581987025, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52273941040039, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8716422915458679, + "num_tokens": 254522031.0, + "step": 6670 + }, + { + "epoch": 0.8486197684772929, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.457185745239258, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8721767067909241, + "num_tokens": 254559249.0, + "step": 6671 + }, + { + "epoch": 0.8487469787558835, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.411231994628906, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.853720486164093, + "num_tokens": 254603968.0, + "step": 6672 + }, + { + "epoch": 0.848874189034474, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.600461959838867, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8703519105911255, + "num_tokens": 254642178.0, + "step": 6673 + }, + { + "epoch": 0.8490013993130645, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40109634399414, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.853428840637207, + "num_tokens": 254679129.0, + "step": 6674 + }, + { + "epoch": 0.849128609591655, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.549034118652344, + "learning_rate": 1e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.8324412703514099, + "num_tokens": 254714952.0, + "step": 6675 + }, + { + "epoch": 0.8492558198702456, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.395904541015625, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8566884398460388, + "num_tokens": 254743323.0, + "step": 6676 + }, + { + "epoch": 0.849383030148836, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.55082130432129, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8667253255844116, + "num_tokens": 254784749.0, + "step": 6677 + }, + { + "epoch": 0.8495102404274265, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.180747985839844, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.866844117641449, + "num_tokens": 254820102.0, + "step": 6678 + }, + { + "epoch": 0.849637450706017, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.391834259033203, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8564925789833069, + "num_tokens": 254859988.0, + "step": 6679 + }, + { + "epoch": 0.8497646609846076, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39166259765625, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8563040494918823, + "num_tokens": 254906792.0, + "step": 6680 + }, + { + "epoch": 0.8498918712631981, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.36852264404297, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8622440695762634, + "num_tokens": 254941552.0, + "step": 6681 + }, + { + "epoch": 0.8500190815417886, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.27672576904297, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.853813648223877, + "num_tokens": 254974653.0, + "step": 6682 + }, + { + "epoch": 0.850146291820379, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.233089447021484, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8452574014663696, + "num_tokens": 255018418.0, + "step": 6683 + }, + { + "epoch": 0.8502735020989696, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39580535888672, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8663251996040344, + "num_tokens": 255049626.0, + "step": 6684 + }, + { + "epoch": 0.8504007123775601, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.56329345703125, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8532336950302124, + "num_tokens": 255086323.0, + "step": 6685 + }, + { + "epoch": 0.8505279226561506, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.342689514160156, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8687761425971985, + "num_tokens": 255124102.0, + "step": 6686 + }, + { + "epoch": 0.8506551329347412, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.283538818359375, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8647048473358154, + "num_tokens": 255162176.0, + "step": 6687 + }, + { + "epoch": 0.8507823432133317, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.428260803222656, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8634141683578491, + "num_tokens": 255196256.0, + "step": 6688 + }, + { + "epoch": 0.8509095534919221, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.277292251586914, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8533483147621155, + "num_tokens": 255236752.0, + "step": 6689 + }, + { + "epoch": 0.8510367637705126, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.479473114013672, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8623427152633667, + "num_tokens": 255268694.0, + "step": 6690 + }, + { + "epoch": 0.8511639740491032, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.430994033813477, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8667982816696167, + "num_tokens": 255309445.0, + "step": 6691 + }, + { + "epoch": 0.8512911843276937, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.150957107543945, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.863474428653717, + "num_tokens": 255341126.0, + "step": 6692 + }, + { + "epoch": 0.8514183946062842, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.606956481933594, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8698602914810181, + "num_tokens": 255379553.0, + "step": 6693 + }, + { + "epoch": 0.8515456048848747, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.272563934326172, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8605492115020752, + "num_tokens": 255416588.0, + "step": 6694 + }, + { + "epoch": 0.8516728151634652, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.557905197143555, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8494951128959656, + "num_tokens": 255449574.0, + "step": 6695 + }, + { + "epoch": 0.8518000254420557, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.248952865600586, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8584195375442505, + "num_tokens": 255494088.0, + "step": 6696 + }, + { + "epoch": 0.8519272357206462, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.572063446044922, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8693634271621704, + "num_tokens": 255529785.0, + "step": 6697 + }, + { + "epoch": 0.8520544459992367, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.163061141967773, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8473129272460938, + "num_tokens": 255566675.0, + "step": 6698 + }, + { + "epoch": 0.8521816562778273, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.36979103088379, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8654041886329651, + "num_tokens": 255606502.0, + "step": 6699 + }, + { + "epoch": 0.8523088665564178, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.31292724609375, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8517569303512573, + "num_tokens": 255649179.0, + "step": 6700 + }, + { + "epoch": 0.8524360768350082, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.453203201293945, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8598119616508484, + "num_tokens": 255688190.0, + "step": 6701 + }, + { + "epoch": 0.8525632871135987, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.360952377319336, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8599942922592163, + "num_tokens": 255730401.0, + "step": 6702 + }, + { + "epoch": 0.8526904973921893, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58928871154785, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8560690879821777, + "num_tokens": 255767881.0, + "step": 6703 + }, + { + "epoch": 0.8528177076707798, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.298158645629883, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8535836338996887, + "num_tokens": 255808347.0, + "step": 6704 + }, + { + "epoch": 0.8529449179493703, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.425508499145508, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8417634963989258, + "num_tokens": 255849538.0, + "step": 6705 + }, + { + "epoch": 0.8530721282279609, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.46529769897461, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8711705207824707, + "num_tokens": 255890228.0, + "step": 6706 + }, + { + "epoch": 0.8531993385065513, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.23240089416504, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8588731288909912, + "num_tokens": 255930189.0, + "step": 6707 + }, + { + "epoch": 0.8533265487851418, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48398208618164, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8676970601081848, + "num_tokens": 255968310.0, + "step": 6708 + }, + { + "epoch": 0.8534537590637323, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.30055046081543, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8717607259750366, + "num_tokens": 256005872.0, + "step": 6709 + }, + { + "epoch": 0.8535809693423229, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.626811981201172, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8657753467559814, + "num_tokens": 256045374.0, + "step": 6710 + }, + { + "epoch": 0.8537081796209134, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.519994735717773, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8640738725662231, + "num_tokens": 256084874.0, + "step": 6711 + }, + { + "epoch": 0.8538353898995039, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.46036148071289, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8530293703079224, + "num_tokens": 256122251.0, + "step": 6712 + }, + { + "epoch": 0.8539626001780944, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.44428062438965, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8585311770439148, + "num_tokens": 256165025.0, + "step": 6713 + }, + { + "epoch": 0.8540898104566849, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35807991027832, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8677501678466797, + "num_tokens": 256202220.0, + "step": 6714 + }, + { + "epoch": 0.8542170207352754, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.447662353515625, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8629252910614014, + "num_tokens": 256234036.0, + "step": 6715 + }, + { + "epoch": 0.8543442310138659, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.345012664794922, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8592256307601929, + "num_tokens": 256275253.0, + "step": 6716 + }, + { + "epoch": 0.8544714412924564, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.534391403198242, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.858106255531311, + "num_tokens": 256311296.0, + "step": 6717 + }, + { + "epoch": 0.854598651571047, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.404447555541992, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8666598200798035, + "num_tokens": 256348149.0, + "step": 6718 + }, + { + "epoch": 0.8547258618496375, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45283317565918, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8743680715560913, + "num_tokens": 256387861.0, + "step": 6719 + }, + { + "epoch": 0.8548530721282279, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.264034271240234, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8694756031036377, + "num_tokens": 256427131.0, + "step": 6720 + }, + { + "epoch": 0.8549802824068184, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.508636474609375, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8658081293106079, + "num_tokens": 256465041.0, + "step": 6721 + }, + { + "epoch": 0.855107492685409, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.501312255859375, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8522289991378784, + "num_tokens": 256502576.0, + "step": 6722 + }, + { + "epoch": 0.8552347029639995, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.346538543701172, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8627136945724487, + "num_tokens": 256540493.0, + "step": 6723 + }, + { + "epoch": 0.85536191324259, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.539154052734375, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8445389866828918, + "num_tokens": 256578115.0, + "step": 6724 + }, + { + "epoch": 0.8554891235211806, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.444232940673828, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8685272932052612, + "num_tokens": 256615957.0, + "step": 6725 + }, + { + "epoch": 0.855616333799771, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3441219329834, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8649852275848389, + "num_tokens": 256649972.0, + "step": 6726 + }, + { + "epoch": 0.8557435440783615, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.539907455444336, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8683100342750549, + "num_tokens": 256686683.0, + "step": 6727 + }, + { + "epoch": 0.855870754356952, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.20414161682129, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8590438961982727, + "num_tokens": 256723587.0, + "step": 6728 + }, + { + "epoch": 0.8559979646355426, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.569875717163086, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8618350028991699, + "num_tokens": 256765245.0, + "step": 6729 + }, + { + "epoch": 0.8561251749141331, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.385181427001953, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.88824063539505, + "num_tokens": 256806930.0, + "step": 6730 + }, + { + "epoch": 0.8562523851927236, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.47162437438965, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8591115474700928, + "num_tokens": 256844568.0, + "step": 6731 + }, + { + "epoch": 0.856379595471314, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35085678100586, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8585513830184937, + "num_tokens": 256877483.0, + "step": 6732 + }, + { + "epoch": 0.8565068057499046, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.16474151611328, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8660176992416382, + "num_tokens": 256921139.0, + "step": 6733 + }, + { + "epoch": 0.8566340160284951, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.56773567199707, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.861579418182373, + "num_tokens": 256957861.0, + "step": 6734 + }, + { + "epoch": 0.8567612263070856, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.378782272338867, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8628947138786316, + "num_tokens": 256998582.0, + "step": 6735 + }, + { + "epoch": 0.8568884365856761, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.448087692260742, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.860971212387085, + "num_tokens": 257037638.0, + "step": 6736 + }, + { + "epoch": 0.8570156468642667, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.536951065063477, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.86329185962677, + "num_tokens": 257074427.0, + "step": 6737 + }, + { + "epoch": 0.8571428571428571, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.1756591796875, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8649240136146545, + "num_tokens": 257113826.0, + "step": 6738 + }, + { + "epoch": 0.8572700674214476, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.669694900512695, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8477383852005005, + "num_tokens": 257152387.0, + "step": 6739 + }, + { + "epoch": 0.8573972777000382, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.476511001586914, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8617554903030396, + "num_tokens": 257192083.0, + "step": 6740 + }, + { + "epoch": 0.8575244879786287, + "ewc_loss": 0.0269775390625, + "ewc_loss_parallel": 2.6941299438476562e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40436553955078, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8538684844970703, + "num_tokens": 257235425.0, + "step": 6741 + }, + { + "epoch": 0.8576516982572192, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.376432418823242, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8574745655059814, + "num_tokens": 257268621.0, + "step": 6742 + }, + { + "epoch": 0.8577789085358097, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.432369232177734, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8660470843315125, + "num_tokens": 257306909.0, + "step": 6743 + }, + { + "epoch": 0.8579061188144002, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.300233840942383, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8749963045120239, + "num_tokens": 257343388.0, + "step": 6744 + }, + { + "epoch": 0.8580333290929907, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4820499420166, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8664692640304565, + "num_tokens": 257377172.0, + "step": 6745 + }, + { + "epoch": 0.8581605393715812, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.349044799804688, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8540926575660706, + "num_tokens": 257410312.0, + "step": 6746 + }, + { + "epoch": 0.8582877496501717, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.417509078979492, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8643949031829834, + "num_tokens": 257449919.0, + "step": 6747 + }, + { + "epoch": 0.8584149599287623, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.433181762695312, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8723576664924622, + "num_tokens": 257488296.0, + "step": 6748 + }, + { + "epoch": 0.8585421702073528, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.228408813476562, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8696804046630859, + "num_tokens": 257524678.0, + "step": 6749 + }, + { + "epoch": 0.8586693804859432, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.29229164123535, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8668822050094604, + "num_tokens": 257559060.0, + "step": 6750 + }, + { + "epoch": 0.8587965907645337, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.303998947143555, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8703776597976685, + "num_tokens": 257594340.0, + "step": 6751 + }, + { + "epoch": 0.8589238010431243, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.261640548706055, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8724775910377502, + "num_tokens": 257627936.0, + "step": 6752 + }, + { + "epoch": 0.8590510113217148, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.458641052246094, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.859033465385437, + "num_tokens": 257660311.0, + "step": 6753 + }, + { + "epoch": 0.8591782216003053, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.281856536865234, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8533086776733398, + "num_tokens": 257697496.0, + "step": 6754 + }, + { + "epoch": 0.8593054318788959, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.331623077392578, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8440510034561157, + "num_tokens": 257737242.0, + "step": 6755 + }, + { + "epoch": 0.8594326421574863, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.488412857055664, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8626460433006287, + "num_tokens": 257772946.0, + "step": 6756 + }, + { + "epoch": 0.8595598524360768, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.430288314819336, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8563524484634399, + "num_tokens": 257807198.0, + "step": 6757 + }, + { + "epoch": 0.8596870627146673, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.21143341064453, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8671878576278687, + "num_tokens": 257848618.0, + "step": 6758 + }, + { + "epoch": 0.8598142729932579, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.186248779296875, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8729618787765503, + "num_tokens": 257887805.0, + "step": 6759 + }, + { + "epoch": 0.8599414832718484, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.44822120666504, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8676400184631348, + "num_tokens": 257929276.0, + "step": 6760 + }, + { + "epoch": 0.8600686935504389, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.459964752197266, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8553577065467834, + "num_tokens": 257958629.0, + "step": 6761 + }, + { + "epoch": 0.8601959038290294, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.442668914794922, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8582999110221863, + "num_tokens": 257990303.0, + "step": 6762 + }, + { + "epoch": 0.8603231141076199, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.333755493164062, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8452247977256775, + "num_tokens": 258028704.0, + "step": 6763 + }, + { + "epoch": 0.8604503243862104, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.23504066467285, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8701026439666748, + "num_tokens": 258062757.0, + "step": 6764 + }, + { + "epoch": 0.8605775346648009, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.503873825073242, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8611295223236084, + "num_tokens": 258098574.0, + "step": 6765 + }, + { + "epoch": 0.8607047449433914, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.328916549682617, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8707501888275146, + "num_tokens": 258130937.0, + "step": 6766 + }, + { + "epoch": 0.860831955221982, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.411876678466797, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8639670610427856, + "num_tokens": 258165142.0, + "step": 6767 + }, + { + "epoch": 0.8609591655005725, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.293413162231445, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8784452676773071, + "num_tokens": 258203986.0, + "step": 6768 + }, + { + "epoch": 0.8610863757791629, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.575122833251953, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8620907664299011, + "num_tokens": 258238692.0, + "step": 6769 + }, + { + "epoch": 0.8612135860577534, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.323991775512695, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8422431945800781, + "num_tokens": 258280942.0, + "step": 6770 + }, + { + "epoch": 0.861340796336344, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35068130493164, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8662277460098267, + "num_tokens": 258318835.0, + "step": 6771 + }, + { + "epoch": 0.8614680066149345, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48916244506836, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8531150221824646, + "num_tokens": 258359461.0, + "step": 6772 + }, + { + "epoch": 0.861595216893525, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.433603286743164, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8577562570571899, + "num_tokens": 258394582.0, + "step": 6773 + }, + { + "epoch": 0.8617224271721156, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.340551376342773, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8646236062049866, + "num_tokens": 258438526.0, + "step": 6774 + }, + { + "epoch": 0.861849637450706, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.30961036682129, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8486824035644531, + "num_tokens": 258479332.0, + "step": 6775 + }, + { + "epoch": 0.8619768477292965, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42884063720703, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8528333902359009, + "num_tokens": 258513202.0, + "step": 6776 + }, + { + "epoch": 0.862104058007887, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.336132049560547, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.864021360874176, + "num_tokens": 258549145.0, + "step": 6777 + }, + { + "epoch": 0.8622312682864776, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.396806716918945, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8530012965202332, + "num_tokens": 258586421.0, + "step": 6778 + }, + { + "epoch": 0.8623584785650681, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.399215698242188, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8618937134742737, + "num_tokens": 258620014.0, + "step": 6779 + }, + { + "epoch": 0.8624856888436586, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.463409423828125, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8669924139976501, + "num_tokens": 258656124.0, + "step": 6780 + }, + { + "epoch": 0.862612899122249, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.499752044677734, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.850831151008606, + "num_tokens": 258692537.0, + "step": 6781 + }, + { + "epoch": 0.8627401094008396, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.350900650024414, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8650804162025452, + "num_tokens": 258732737.0, + "step": 6782 + }, + { + "epoch": 0.8628673196794301, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.26491928100586, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8494541645050049, + "num_tokens": 258770634.0, + "step": 6783 + }, + { + "epoch": 0.8629945299580206, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.498390197753906, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8607139587402344, + "num_tokens": 258809643.0, + "step": 6784 + }, + { + "epoch": 0.8631217402366111, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28997039794922, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8670432567596436, + "num_tokens": 258845336.0, + "step": 6785 + }, + { + "epoch": 0.8632489505152017, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.439756393432617, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.863563060760498, + "num_tokens": 258882206.0, + "step": 6786 + }, + { + "epoch": 0.8633761607937921, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.377403259277344, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8694182634353638, + "num_tokens": 258918661.0, + "step": 6787 + }, + { + "epoch": 0.8635033710723826, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.47229766845703, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8567371368408203, + "num_tokens": 258957452.0, + "step": 6788 + }, + { + "epoch": 0.8636305813509731, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.406288146972656, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8536238670349121, + "num_tokens": 259003507.0, + "step": 6789 + }, + { + "epoch": 0.8637577916295637, + "ewc_loss": 0.027099609375, + "ewc_loss_parallel": 2.7060508728027344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.235794067382812, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8565293550491333, + "num_tokens": 259042284.0, + "step": 6790 + }, + { + "epoch": 0.8638850019081542, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.37381935119629, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8753653764724731, + "num_tokens": 259082929.0, + "step": 6791 + }, + { + "epoch": 0.8640122121867447, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.306486129760742, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8764375448226929, + "num_tokens": 259125920.0, + "step": 6792 + }, + { + "epoch": 0.8641394224653351, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.633514404296875, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8626327514648438, + "num_tokens": 259163140.0, + "step": 6793 + }, + { + "epoch": 0.8642666327439257, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.50484275817871, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8601335883140564, + "num_tokens": 259199038.0, + "step": 6794 + }, + { + "epoch": 0.8643938430225162, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.44246482849121, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8598204851150513, + "num_tokens": 259235713.0, + "step": 6795 + }, + { + "epoch": 0.8645210533011067, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.400814056396484, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.850978434085846, + "num_tokens": 259278314.0, + "step": 6796 + }, + { + "epoch": 0.8646482635796973, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.384899139404297, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8705160021781921, + "num_tokens": 259313039.0, + "step": 6797 + }, + { + "epoch": 0.8647754738582878, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.329557418823242, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8691109418869019, + "num_tokens": 259348845.0, + "step": 6798 + }, + { + "epoch": 0.8649026841368782, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.475513458251953, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8554166555404663, + "num_tokens": 259387345.0, + "step": 6799 + }, + { + "epoch": 0.8650298944154687, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.335025787353516, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.860634446144104, + "num_tokens": 259427835.0, + "step": 6800 + }, + { + "epoch": 0.8651571046940593, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.277725219726562, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8662242293357849, + "num_tokens": 259460453.0, + "step": 6801 + }, + { + "epoch": 0.8652843149726498, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.430814743041992, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8680734634399414, + "num_tokens": 259499296.0, + "step": 6802 + }, + { + "epoch": 0.8654115252512403, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.320268630981445, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8592045307159424, + "num_tokens": 259537649.0, + "step": 6803 + }, + { + "epoch": 0.8655387355298308, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.37889289855957, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.853434145450592, + "num_tokens": 259583136.0, + "step": 6804 + }, + { + "epoch": 0.8656659458084213, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.301176071166992, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8653408288955688, + "num_tokens": 259620422.0, + "step": 6805 + }, + { + "epoch": 0.8657931560870118, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.372060775756836, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8496783971786499, + "num_tokens": 259663948.0, + "step": 6806 + }, + { + "epoch": 0.8659203663656023, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.432762145996094, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.863513708114624, + "num_tokens": 259701986.0, + "step": 6807 + }, + { + "epoch": 0.8660475766441929, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24768829345703, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8764359354972839, + "num_tokens": 259740684.0, + "step": 6808 + }, + { + "epoch": 0.8661747869227834, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.480083465576172, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8450815677642822, + "num_tokens": 259781659.0, + "step": 6809 + }, + { + "epoch": 0.8663019972013739, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39158058166504, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8566447496414185, + "num_tokens": 259818554.0, + "step": 6810 + }, + { + "epoch": 0.8664292074799644, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.449037551879883, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.863735020160675, + "num_tokens": 259856652.0, + "step": 6811 + }, + { + "epoch": 0.8665564177585549, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52220344543457, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8793399333953857, + "num_tokens": 259898255.0, + "step": 6812 + }, + { + "epoch": 0.8666836280371454, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.305988311767578, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8660575151443481, + "num_tokens": 259935305.0, + "step": 6813 + }, + { + "epoch": 0.8668108383157359, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.556896209716797, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8659852743148804, + "num_tokens": 259975245.0, + "step": 6814 + }, + { + "epoch": 0.8669380485943264, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.244632720947266, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8667284250259399, + "num_tokens": 260014150.0, + "step": 6815 + }, + { + "epoch": 0.867065258872917, + "ewc_loss": 0.0272216796875, + "ewc_loss_parallel": 2.7179718017578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.591190338134766, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8524065017700195, + "num_tokens": 260054655.0, + "step": 6816 + }, + { + "epoch": 0.8671924691515075, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35256004333496, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8621019124984741, + "num_tokens": 260091528.0, + "step": 6817 + }, + { + "epoch": 0.8673196794300979, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42134666442871, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8758420944213867, + "num_tokens": 260123069.0, + "step": 6818 + }, + { + "epoch": 0.8674468897086884, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28199005126953, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8462899923324585, + "num_tokens": 260154482.0, + "step": 6819 + }, + { + "epoch": 0.867574099987279, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.512788772583008, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.863770604133606, + "num_tokens": 260193983.0, + "step": 6820 + }, + { + "epoch": 0.8677013102658695, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.478418350219727, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8646573424339294, + "num_tokens": 260228705.0, + "step": 6821 + }, + { + "epoch": 0.86782852054446, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.474472045898438, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8608949184417725, + "num_tokens": 260262821.0, + "step": 6822 + }, + { + "epoch": 0.8679557308230506, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.47176170349121, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8601677417755127, + "num_tokens": 260298949.0, + "step": 6823 + }, + { + "epoch": 0.868082941101641, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.19482421875, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8700165748596191, + "num_tokens": 260340028.0, + "step": 6824 + }, + { + "epoch": 0.8682101513802315, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.584421157836914, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8609957695007324, + "num_tokens": 260381328.0, + "step": 6825 + }, + { + "epoch": 0.868337361658822, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.341320037841797, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8627787828445435, + "num_tokens": 260423421.0, + "step": 6826 + }, + { + "epoch": 0.8684645719374126, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.529539108276367, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8601149320602417, + "num_tokens": 260461985.0, + "step": 6827 + }, + { + "epoch": 0.8685917822160031, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.27097511291504, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8460745811462402, + "num_tokens": 260503041.0, + "step": 6828 + }, + { + "epoch": 0.8687189924945936, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.675569534301758, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8696060180664062, + "num_tokens": 260545553.0, + "step": 6829 + }, + { + "epoch": 0.868846202773184, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.257549285888672, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8686914443969727, + "num_tokens": 260577160.0, + "step": 6830 + }, + { + "epoch": 0.8689734130517746, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.326393127441406, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8734407424926758, + "num_tokens": 260614748.0, + "step": 6831 + }, + { + "epoch": 0.8691006233303651, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.572954177856445, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.862358808517456, + "num_tokens": 260651762.0, + "step": 6832 + }, + { + "epoch": 0.8692278336089556, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3170223236084, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.850438117980957, + "num_tokens": 260694438.0, + "step": 6833 + }, + { + "epoch": 0.8693550438875461, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.485939025878906, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8705037832260132, + "num_tokens": 260730256.0, + "step": 6834 + }, + { + "epoch": 0.8694822541661367, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.464130401611328, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8411372900009155, + "num_tokens": 260766190.0, + "step": 6835 + }, + { + "epoch": 0.8696094644447271, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.608434677124023, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8761614561080933, + "num_tokens": 260801733.0, + "step": 6836 + }, + { + "epoch": 0.8697366747233176, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.389076232910156, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.87305748462677, + "num_tokens": 260843461.0, + "step": 6837 + }, + { + "epoch": 0.8698638850019081, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.46077537536621, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8600607514381409, + "num_tokens": 260875006.0, + "step": 6838 + }, + { + "epoch": 0.8699910952804987, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52039909362793, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8561932444572449, + "num_tokens": 260917553.0, + "step": 6839 + }, + { + "epoch": 0.8701183055590892, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.355356216430664, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8378482460975647, + "num_tokens": 260955293.0, + "step": 6840 + }, + { + "epoch": 0.8702455158376797, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.605037689208984, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8618223667144775, + "num_tokens": 260983966.0, + "step": 6841 + }, + { + "epoch": 0.8703727261162701, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.469690322875977, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8745226860046387, + "num_tokens": 261019434.0, + "step": 6842 + }, + { + "epoch": 0.8704999363948607, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.352691650390625, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8505212068557739, + "num_tokens": 261053761.0, + "step": 6843 + }, + { + "epoch": 0.8706271466734512, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.315095901489258, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8520412445068359, + "num_tokens": 261093490.0, + "step": 6844 + }, + { + "epoch": 0.8707543569520417, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58904457092285, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8635941743850708, + "num_tokens": 261129518.0, + "step": 6845 + }, + { + "epoch": 0.8708815672306323, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.380393981933594, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8446287512779236, + "num_tokens": 261164282.0, + "step": 6846 + }, + { + "epoch": 0.8710087775092228, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48727798461914, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8418506383895874, + "num_tokens": 261205055.0, + "step": 6847 + }, + { + "epoch": 0.8711359877878132, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.319114685058594, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.856467604637146, + "num_tokens": 261242329.0, + "step": 6848 + }, + { + "epoch": 0.8712631980664037, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.43060302734375, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8351172208786011, + "num_tokens": 261280330.0, + "step": 6849 + }, + { + "epoch": 0.8713904083449943, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39421844482422, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8727295398712158, + "num_tokens": 261318232.0, + "step": 6850 + }, + { + "epoch": 0.8715176186235848, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.255388259887695, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8784573674201965, + "num_tokens": 261351483.0, + "step": 6851 + }, + { + "epoch": 0.8716448289021753, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.536893844604492, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.864762544631958, + "num_tokens": 261387058.0, + "step": 6852 + }, + { + "epoch": 0.8717720391807658, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.352691650390625, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8768746852874756, + "num_tokens": 261430110.0, + "step": 6853 + }, + { + "epoch": 0.8718992494593563, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62786102294922, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8685227632522583, + "num_tokens": 261470707.0, + "step": 6854 + }, + { + "epoch": 0.8720264597379468, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.603364944458008, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8690835237503052, + "num_tokens": 261507146.0, + "step": 6855 + }, + { + "epoch": 0.8721536700165373, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.319040298461914, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8472714424133301, + "num_tokens": 261549723.0, + "step": 6856 + }, + { + "epoch": 0.8722808802951278, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.443801879882812, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8675100207328796, + "num_tokens": 261591030.0, + "step": 6857 + }, + { + "epoch": 0.8724080905737184, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.31869125366211, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.871340811252594, + "num_tokens": 261623919.0, + "step": 6858 + }, + { + "epoch": 0.8725353008523089, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3702392578125, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8790323734283447, + "num_tokens": 261655277.0, + "step": 6859 + }, + { + "epoch": 0.8726625111308994, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.490755081176758, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8494342565536499, + "num_tokens": 261697462.0, + "step": 6860 + }, + { + "epoch": 0.8727897214094898, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.294279098510742, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8590496778488159, + "num_tokens": 261739134.0, + "step": 6861 + }, + { + "epoch": 0.8729169316880804, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48215675354004, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8648505210876465, + "num_tokens": 261776295.0, + "step": 6862 + }, + { + "epoch": 0.8730441419666709, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42193031311035, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8664230704307556, + "num_tokens": 261812388.0, + "step": 6863 + }, + { + "epoch": 0.8731713522452614, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.329927444458008, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8632504940032959, + "num_tokens": 261855537.0, + "step": 6864 + }, + { + "epoch": 0.873298562523852, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.470867156982422, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8702536821365356, + "num_tokens": 261899922.0, + "step": 6865 + }, + { + "epoch": 0.8734257728024425, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.465097427368164, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8738848567008972, + "num_tokens": 261934475.0, + "step": 6866 + }, + { + "epoch": 0.8735529830810329, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.333162307739258, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8625789880752563, + "num_tokens": 261973395.0, + "step": 6867 + }, + { + "epoch": 0.8736801933596234, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.502460479736328, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8454608917236328, + "num_tokens": 262011644.0, + "step": 6868 + }, + { + "epoch": 0.873807403638214, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.549421310424805, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8576265573501587, + "num_tokens": 262044038.0, + "step": 6869 + }, + { + "epoch": 0.8739346139168045, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.29306411743164, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8607540130615234, + "num_tokens": 262081023.0, + "step": 6870 + }, + { + "epoch": 0.874061824195395, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.554996490478516, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8553436994552612, + "num_tokens": 262123981.0, + "step": 6871 + }, + { + "epoch": 0.8741890344739855, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.380699157714844, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8528953194618225, + "num_tokens": 262161542.0, + "step": 6872 + }, + { + "epoch": 0.874316244752576, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.410032272338867, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8520047664642334, + "num_tokens": 262202482.0, + "step": 6873 + }, + { + "epoch": 0.8744434550311665, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4814510345459, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8640422224998474, + "num_tokens": 262239499.0, + "step": 6874 + }, + { + "epoch": 0.874570665309757, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.600786209106445, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8631966710090637, + "num_tokens": 262281324.0, + "step": 6875 + }, + { + "epoch": 0.8746978755883476, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.36195945739746, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8533980250358582, + "num_tokens": 262318772.0, + "step": 6876 + }, + { + "epoch": 0.8748250858669381, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.446735382080078, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8751176595687866, + "num_tokens": 262353264.0, + "step": 6877 + }, + { + "epoch": 0.8749522961455286, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.645790100097656, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8595985770225525, + "num_tokens": 262393360.0, + "step": 6878 + }, + { + "epoch": 0.875079506424119, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.358749389648438, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8725904226303101, + "num_tokens": 262432010.0, + "step": 6879 + }, + { + "epoch": 0.8752067167027096, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6075496673584, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8611915707588196, + "num_tokens": 262468828.0, + "step": 6880 + }, + { + "epoch": 0.8753339269813001, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.346467971801758, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8691176176071167, + "num_tokens": 262515141.0, + "step": 6881 + }, + { + "epoch": 0.8754611372598906, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.527624130249023, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8594381809234619, + "num_tokens": 262553137.0, + "step": 6882 + }, + { + "epoch": 0.8755883475384811, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45920753479004, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8845165371894836, + "num_tokens": 262593847.0, + "step": 6883 + }, + { + "epoch": 0.8757155578170717, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.752193450927734, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8516445755958557, + "num_tokens": 262635048.0, + "step": 6884 + }, + { + "epoch": 0.8758427680956621, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.346435546875, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8539814949035645, + "num_tokens": 262671200.0, + "step": 6885 + }, + { + "epoch": 0.8759699783742526, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.599130630493164, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8639529943466187, + "num_tokens": 262708304.0, + "step": 6886 + }, + { + "epoch": 0.8760971886528431, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.429534912109375, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8587900400161743, + "num_tokens": 262750680.0, + "step": 6887 + }, + { + "epoch": 0.8762243989314337, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.515810012817383, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8568828105926514, + "num_tokens": 262784845.0, + "step": 6888 + }, + { + "epoch": 0.8763516092100242, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.41573143005371, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8615530729293823, + "num_tokens": 262822646.0, + "step": 6889 + }, + { + "epoch": 0.8764788194886147, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.391040802001953, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8584293127059937, + "num_tokens": 262863663.0, + "step": 6890 + }, + { + "epoch": 0.8766060297672051, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.531414031982422, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8496191501617432, + "num_tokens": 262903726.0, + "step": 6891 + }, + { + "epoch": 0.8767332400457957, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.465869903564453, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8507939577102661, + "num_tokens": 262942748.0, + "step": 6892 + }, + { + "epoch": 0.8768604503243862, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.543058395385742, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8413952589035034, + "num_tokens": 262977639.0, + "step": 6893 + }, + { + "epoch": 0.8769876606029767, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.326128005981445, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8654217720031738, + "num_tokens": 263015970.0, + "step": 6894 + }, + { + "epoch": 0.8771148708815673, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.359355926513672, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8712878227233887, + "num_tokens": 263058618.0, + "step": 6895 + }, + { + "epoch": 0.8772420811601578, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.587247848510742, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8453781604766846, + "num_tokens": 263098628.0, + "step": 6896 + }, + { + "epoch": 0.8773692914387482, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28601837158203, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8651061058044434, + "num_tokens": 263136793.0, + "step": 6897 + }, + { + "epoch": 0.8774965017173387, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.50145149230957, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8646073937416077, + "num_tokens": 263170996.0, + "step": 6898 + }, + { + "epoch": 0.8776237119959293, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.322628021240234, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8527700901031494, + "num_tokens": 263199961.0, + "step": 6899 + }, + { + "epoch": 0.8777509222745198, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.304285049438477, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8606189489364624, + "num_tokens": 263241541.0, + "step": 6900 + }, + { + "epoch": 0.8778781325531103, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.38288688659668, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8745819330215454, + "num_tokens": 263278778.0, + "step": 6901 + }, + { + "epoch": 0.8780053428317008, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.500274658203125, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8643018007278442, + "num_tokens": 263310155.0, + "step": 6902 + }, + { + "epoch": 0.8781325531102913, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39101219177246, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8584126830101013, + "num_tokens": 263355793.0, + "step": 6903 + }, + { + "epoch": 0.8782597633888818, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.504709243774414, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8719234466552734, + "num_tokens": 263394446.0, + "step": 6904 + }, + { + "epoch": 0.8783869736674723, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.406044006347656, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8662617802619934, + "num_tokens": 263438479.0, + "step": 6905 + }, + { + "epoch": 0.8785141839460628, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.41599464416504, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8475450873374939, + "num_tokens": 263478514.0, + "step": 6906 + }, + { + "epoch": 0.8786413942246534, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.43346405029297, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8418943881988525, + "num_tokens": 263516734.0, + "step": 6907 + }, + { + "epoch": 0.8787686045032439, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.50119972229004, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8528430461883545, + "num_tokens": 263560105.0, + "step": 6908 + }, + { + "epoch": 0.8788958147818343, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.388111114501953, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8638348579406738, + "num_tokens": 263597033.0, + "step": 6909 + }, + { + "epoch": 0.8790230250604248, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35089874267578, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8688676357269287, + "num_tokens": 263631106.0, + "step": 6910 + }, + { + "epoch": 0.8791502353390154, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.335756301879883, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8566540479660034, + "num_tokens": 263668227.0, + "step": 6911 + }, + { + "epoch": 0.8792774456176059, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48381233215332, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8633691668510437, + "num_tokens": 263704335.0, + "step": 6912 + }, + { + "epoch": 0.8794046558961964, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.237985610961914, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8684265613555908, + "num_tokens": 263742532.0, + "step": 6913 + }, + { + "epoch": 0.879531866174787, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.411304473876953, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.863233745098114, + "num_tokens": 263782321.0, + "step": 6914 + }, + { + "epoch": 0.8796590764533775, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.37820053100586, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8693331480026245, + "num_tokens": 263818037.0, + "step": 6915 + }, + { + "epoch": 0.8797862867319679, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.28864288330078, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8476296663284302, + "num_tokens": 263854901.0, + "step": 6916 + }, + { + "epoch": 0.8799134970105584, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.332866668701172, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.866144061088562, + "num_tokens": 263896241.0, + "step": 6917 + }, + { + "epoch": 0.880040707289149, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.512605667114258, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8636853098869324, + "num_tokens": 263934673.0, + "step": 6918 + }, + { + "epoch": 0.8801679175677395, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4608154296875, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8533617854118347, + "num_tokens": 263973456.0, + "step": 6919 + }, + { + "epoch": 0.88029512784633, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45466423034668, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8440702557563782, + "num_tokens": 264006050.0, + "step": 6920 + }, + { + "epoch": 0.8804223381249205, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.337533950805664, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8711544275283813, + "num_tokens": 264042736.0, + "step": 6921 + }, + { + "epoch": 0.880549548403511, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.456344604492188, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8524601459503174, + "num_tokens": 264084433.0, + "step": 6922 + }, + { + "epoch": 0.8806767586821015, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.534549713134766, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8699334263801575, + "num_tokens": 264118331.0, + "step": 6923 + }, + { + "epoch": 0.880803968960692, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.584304809570312, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8656314015388489, + "num_tokens": 264152015.0, + "step": 6924 + }, + { + "epoch": 0.8809311792392825, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.37038803100586, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8680001497268677, + "num_tokens": 264187085.0, + "step": 6925 + }, + { + "epoch": 0.8810583895178731, + "ewc_loss": 0.02734375, + "ewc_loss_parallel": 2.7298927307128906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.349708557128906, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8592281341552734, + "num_tokens": 264225558.0, + "step": 6926 + }, + { + "epoch": 0.8811855997964636, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.376550674438477, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8525789976119995, + "num_tokens": 264264220.0, + "step": 6927 + }, + { + "epoch": 0.881312810075054, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.407108306884766, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8647518157958984, + "num_tokens": 264302097.0, + "step": 6928 + }, + { + "epoch": 0.8814400203536445, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.313386917114258, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8644760847091675, + "num_tokens": 264341966.0, + "step": 6929 + }, + { + "epoch": 0.8815672306322351, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.429914474487305, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8391586542129517, + "num_tokens": 264384298.0, + "step": 6930 + }, + { + "epoch": 0.8816944409108256, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42440414428711, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8622397184371948, + "num_tokens": 264422513.0, + "step": 6931 + }, + { + "epoch": 0.8818216511894161, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39927864074707, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.875257134437561, + "num_tokens": 264461819.0, + "step": 6932 + }, + { + "epoch": 0.8819488614680067, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.334436416625977, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8509847521781921, + "num_tokens": 264502618.0, + "step": 6933 + }, + { + "epoch": 0.8820760717465971, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.396526336669922, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8547897934913635, + "num_tokens": 264534382.0, + "step": 6934 + }, + { + "epoch": 0.8822032820251876, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.646896362304688, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8510750532150269, + "num_tokens": 264575297.0, + "step": 6935 + }, + { + "epoch": 0.8823304923037781, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.43074607849121, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.867212176322937, + "num_tokens": 264611822.0, + "step": 6936 + }, + { + "epoch": 0.8824577025823687, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4052791595459, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8573707938194275, + "num_tokens": 264657483.0, + "step": 6937 + }, + { + "epoch": 0.8825849128609592, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.534305572509766, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8673104643821716, + "num_tokens": 264690633.0, + "step": 6938 + }, + { + "epoch": 0.8827121231395497, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.347009658813477, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.876682698726654, + "num_tokens": 264733268.0, + "step": 6939 + }, + { + "epoch": 0.8828393334181401, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.351634979248047, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8553669452667236, + "num_tokens": 264774769.0, + "step": 6940 + }, + { + "epoch": 0.8829665436967307, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.417436599731445, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8579334020614624, + "num_tokens": 264806658.0, + "step": 6941 + }, + { + "epoch": 0.8830937539753212, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.5289249420166, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8495169281959534, + "num_tokens": 264841054.0, + "step": 6942 + }, + { + "epoch": 0.8832209642539117, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.304452896118164, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8560487031936646, + "num_tokens": 264880241.0, + "step": 6943 + }, + { + "epoch": 0.8833481745325023, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.34650993347168, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8626048564910889, + "num_tokens": 264922449.0, + "step": 6944 + }, + { + "epoch": 0.8834753848110928, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.417890548706055, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8616864681243896, + "num_tokens": 264961605.0, + "step": 6945 + }, + { + "epoch": 0.8836025950896832, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.257810592651367, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8698685169219971, + "num_tokens": 264992924.0, + "step": 6946 + }, + { + "epoch": 0.8837298053682737, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.516756057739258, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8615311980247498, + "num_tokens": 265031849.0, + "step": 6947 + }, + { + "epoch": 0.8838570156468643, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.391748428344727, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.873981773853302, + "num_tokens": 265069714.0, + "step": 6948 + }, + { + "epoch": 0.8839842259254548, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3658504486084, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8573337197303772, + "num_tokens": 265103325.0, + "step": 6949 + }, + { + "epoch": 0.8841114362040453, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.345129013061523, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8398923873901367, + "num_tokens": 265139133.0, + "step": 6950 + }, + { + "epoch": 0.8842386464826358, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.308095932006836, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8371310234069824, + "num_tokens": 265183536.0, + "step": 6951 + }, + { + "epoch": 0.8843658567612263, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.486831665039062, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8525029420852661, + "num_tokens": 265226288.0, + "step": 6952 + }, + { + "epoch": 0.8844930670398168, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.449941635131836, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8613095879554749, + "num_tokens": 265265846.0, + "step": 6953 + }, + { + "epoch": 0.8846202773184073, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.46168327331543, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8751468658447266, + "num_tokens": 265300612.0, + "step": 6954 + }, + { + "epoch": 0.8847474875969978, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.5687198638916, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8701134920120239, + "num_tokens": 265341440.0, + "step": 6955 + }, + { + "epoch": 0.8848746978755884, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.433761596679688, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8517202138900757, + "num_tokens": 265384854.0, + "step": 6956 + }, + { + "epoch": 0.8850019081541789, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.706457138061523, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.854561448097229, + "num_tokens": 265420611.0, + "step": 6957 + }, + { + "epoch": 0.8851291184327693, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.410078048706055, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8597330451011658, + "num_tokens": 265457720.0, + "step": 6958 + }, + { + "epoch": 0.8852563287113598, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.239599227905273, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.870111346244812, + "num_tokens": 265493434.0, + "step": 6959 + }, + { + "epoch": 0.8853835389899504, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.568655014038086, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8633586764335632, + "num_tokens": 265529015.0, + "step": 6960 + }, + { + "epoch": 0.8855107492685409, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.41666603088379, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.845974326133728, + "num_tokens": 265568698.0, + "step": 6961 + }, + { + "epoch": 0.8856379595471314, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54172706604004, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8622951507568359, + "num_tokens": 265605631.0, + "step": 6962 + }, + { + "epoch": 0.885765169825722, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.465179443359375, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8624430894851685, + "num_tokens": 265647071.0, + "step": 6963 + }, + { + "epoch": 0.8858923801043125, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.554258346557617, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8606939911842346, + "num_tokens": 265688680.0, + "step": 6964 + }, + { + "epoch": 0.8860195903829029, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35765266418457, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8674535751342773, + "num_tokens": 265727707.0, + "step": 6965 + }, + { + "epoch": 0.8861468006614934, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.460676193237305, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8589959144592285, + "num_tokens": 265765860.0, + "step": 6966 + }, + { + "epoch": 0.886274010940084, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.508535385131836, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8646737933158875, + "num_tokens": 265802988.0, + "step": 6967 + }, + { + "epoch": 0.8864012212186745, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.44701385498047, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8564057946205139, + "num_tokens": 265840614.0, + "step": 6968 + }, + { + "epoch": 0.886528431497265, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.412891387939453, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8350212574005127, + "num_tokens": 265883032.0, + "step": 6969 + }, + { + "epoch": 0.8866556417758555, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.654550552368164, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8730165362358093, + "num_tokens": 265918309.0, + "step": 6970 + }, + { + "epoch": 0.886782852054446, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.16874122619629, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8638817071914673, + "num_tokens": 265956406.0, + "step": 6971 + }, + { + "epoch": 0.8869100623330365, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.56100845336914, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8702397346496582, + "num_tokens": 265988413.0, + "step": 6972 + }, + { + "epoch": 0.887037272611627, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.50444984436035, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.860606849193573, + "num_tokens": 266022036.0, + "step": 6973 + }, + { + "epoch": 0.8871644828902175, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.46052360534668, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8602436780929565, + "num_tokens": 266053735.0, + "step": 6974 + }, + { + "epoch": 0.8872916931688081, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39423942565918, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8758174180984497, + "num_tokens": 266094198.0, + "step": 6975 + }, + { + "epoch": 0.8874189034473986, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.594350814819336, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8704115152359009, + "num_tokens": 266132213.0, + "step": 6976 + }, + { + "epoch": 0.887546113725989, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.539003372192383, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.861972451210022, + "num_tokens": 266168001.0, + "step": 6977 + }, + { + "epoch": 0.8876733240045795, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.504356384277344, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8333889245986938, + "num_tokens": 266208771.0, + "step": 6978 + }, + { + "epoch": 0.8878005342831701, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.651350021362305, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8670388460159302, + "num_tokens": 266248090.0, + "step": 6979 + }, + { + "epoch": 0.8879277445617606, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.573795318603516, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8506100177764893, + "num_tokens": 266283748.0, + "step": 6980 + }, + { + "epoch": 0.8880549548403511, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.475500106811523, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8685188293457031, + "num_tokens": 266316613.0, + "step": 6981 + }, + { + "epoch": 0.8881821651189417, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.497116088867188, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8642383217811584, + "num_tokens": 266357515.0, + "step": 6982 + }, + { + "epoch": 0.8883093753975321, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.450761795043945, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8734400868415833, + "num_tokens": 266393593.0, + "step": 6983 + }, + { + "epoch": 0.8884365856761226, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.454345703125, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8730351328849792, + "num_tokens": 266436104.0, + "step": 6984 + }, + { + "epoch": 0.8885637959547131, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.56570816040039, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8739801645278931, + "num_tokens": 266472809.0, + "step": 6985 + }, + { + "epoch": 0.8886910062333037, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.407621383666992, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8717992305755615, + "num_tokens": 266512726.0, + "step": 6986 + }, + { + "epoch": 0.8888182165118942, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.555068969726562, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.850085973739624, + "num_tokens": 266546098.0, + "step": 6987 + }, + { + "epoch": 0.8889454267904847, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58839225769043, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8522045612335205, + "num_tokens": 266588478.0, + "step": 6988 + }, + { + "epoch": 0.8890726370690751, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.431053161621094, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8509645462036133, + "num_tokens": 266628975.0, + "step": 6989 + }, + { + "epoch": 0.8891998473476657, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.446596145629883, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8707339763641357, + "num_tokens": 266667533.0, + "step": 6990 + }, + { + "epoch": 0.8893270576262562, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.31366729736328, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8446074724197388, + "num_tokens": 266706990.0, + "step": 6991 + }, + { + "epoch": 0.8894542679048467, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.21102523803711, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8496313095092773, + "num_tokens": 266747314.0, + "step": 6992 + }, + { + "epoch": 0.8895814781834372, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.390445709228516, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8527944087982178, + "num_tokens": 266791031.0, + "step": 6993 + }, + { + "epoch": 0.8897086884620278, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.433788299560547, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8608396053314209, + "num_tokens": 266823236.0, + "step": 6994 + }, + { + "epoch": 0.8898358987406182, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42772102355957, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8595359325408936, + "num_tokens": 266867993.0, + "step": 6995 + }, + { + "epoch": 0.8899631090192087, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.436111450195312, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.843684196472168, + "num_tokens": 266908573.0, + "step": 6996 + }, + { + "epoch": 0.8900903192977992, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40656089782715, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8597662448883057, + "num_tokens": 266951167.0, + "step": 6997 + }, + { + "epoch": 0.8902175295763898, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.24414825439453, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8601089119911194, + "num_tokens": 266988976.0, + "step": 6998 + }, + { + "epoch": 0.8903447398549803, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.636449813842773, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8600566983222961, + "num_tokens": 267022450.0, + "step": 6999 + }, + { + "epoch": 0.8904719501335708, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.49578094482422, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8551549315452576, + "num_tokens": 267059933.0, + "step": 7000 + }, + { + "epoch": 0.8905991604121613, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.504199981689453, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8521133661270142, + "num_tokens": 267092518.0, + "step": 7001 + }, + { + "epoch": 0.8907263706907518, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.466936111450195, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8554508090019226, + "num_tokens": 267130008.0, + "step": 7002 + }, + { + "epoch": 0.8908535809693423, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.442651748657227, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8643274307250977, + "num_tokens": 267171039.0, + "step": 7003 + }, + { + "epoch": 0.8909807912479328, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.517555236816406, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8638907670974731, + "num_tokens": 267211197.0, + "step": 7004 + }, + { + "epoch": 0.8911080015265234, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.252962112426758, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8673259615898132, + "num_tokens": 267247471.0, + "step": 7005 + }, + { + "epoch": 0.8912352118051139, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.41456413269043, + "learning_rate": 1e-06, + "loss": 0.531, + "mean_token_accuracy": 0.834388256072998, + "num_tokens": 267287276.0, + "step": 7006 + }, + { + "epoch": 0.8913624220837043, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4760799407959, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8581544160842896, + "num_tokens": 267323827.0, + "step": 7007 + }, + { + "epoch": 0.8914896323622948, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.264638900756836, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8692948222160339, + "num_tokens": 267364053.0, + "step": 7008 + }, + { + "epoch": 0.8916168426408854, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42098045349121, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8441176414489746, + "num_tokens": 267404393.0, + "step": 7009 + }, + { + "epoch": 0.8917440529194759, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.516891479492188, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.861741304397583, + "num_tokens": 267444589.0, + "step": 7010 + }, + { + "epoch": 0.8918712631980664, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53002166748047, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8617995977401733, + "num_tokens": 267484460.0, + "step": 7011 + }, + { + "epoch": 0.891998473476657, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.35468292236328, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8543787002563477, + "num_tokens": 267517390.0, + "step": 7012 + }, + { + "epoch": 0.8921256837552475, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.460620880126953, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8609774112701416, + "num_tokens": 267555813.0, + "step": 7013 + }, + { + "epoch": 0.8922528940338379, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.384672164916992, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8578743934631348, + "num_tokens": 267595818.0, + "step": 7014 + }, + { + "epoch": 0.8923801043124284, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.401634216308594, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8348754644393921, + "num_tokens": 267633106.0, + "step": 7015 + }, + { + "epoch": 0.892507314591019, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.519412994384766, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.863710343837738, + "num_tokens": 267670166.0, + "step": 7016 + }, + { + "epoch": 0.8926345248696095, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.34000587463379, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8578537106513977, + "num_tokens": 267707284.0, + "step": 7017 + }, + { + "epoch": 0.8927617351482, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.373300552368164, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8624307513237, + "num_tokens": 267742349.0, + "step": 7018 + }, + { + "epoch": 0.8928889454267905, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4548397064209, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8654581308364868, + "num_tokens": 267776003.0, + "step": 7019 + }, + { + "epoch": 0.893016155705381, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.454700469970703, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8557574152946472, + "num_tokens": 267815887.0, + "step": 7020 + }, + { + "epoch": 0.8931433659839715, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.343948364257812, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8714227676391602, + "num_tokens": 267855874.0, + "step": 7021 + }, + { + "epoch": 0.893270576262562, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.408702850341797, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.870383083820343, + "num_tokens": 267897901.0, + "step": 7022 + }, + { + "epoch": 0.8933977865411525, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.490201950073242, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8718894124031067, + "num_tokens": 267938967.0, + "step": 7023 + }, + { + "epoch": 0.8935249968197431, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.450647354125977, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8573735356330872, + "num_tokens": 267975196.0, + "step": 7024 + }, + { + "epoch": 0.8936522070983336, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.457725524902344, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8745852112770081, + "num_tokens": 268014407.0, + "step": 7025 + }, + { + "epoch": 0.893779417376924, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39097785949707, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8763486742973328, + "num_tokens": 268043892.0, + "step": 7026 + }, + { + "epoch": 0.8939066276555145, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.318086624145508, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8610678911209106, + "num_tokens": 268079651.0, + "step": 7027 + }, + { + "epoch": 0.8940338379341051, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.358802795410156, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8645141124725342, + "num_tokens": 268114021.0, + "step": 7028 + }, + { + "epoch": 0.8941610482126956, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.000484466552734, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8597590923309326, + "num_tokens": 268155315.0, + "step": 7029 + }, + { + "epoch": 0.8942882584912861, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.240196228027344, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8583396673202515, + "num_tokens": 268199312.0, + "step": 7030 + }, + { + "epoch": 0.8944154687698767, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.25267791748047, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8680741786956787, + "num_tokens": 268236955.0, + "step": 7031 + }, + { + "epoch": 0.8945426790484671, + "ewc_loss": 0.0274658203125, + "ewc_loss_parallel": 2.7418136596679688e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.425771713256836, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8588471412658691, + "num_tokens": 268272183.0, + "step": 7032 + }, + { + "epoch": 0.8946698893270576, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90842628479004, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8513827919960022, + "num_tokens": 268305407.0, + "step": 7033 + }, + { + "epoch": 0.8947970996056481, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.354890823364258, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8719595670700073, + "num_tokens": 268340443.0, + "step": 7034 + }, + { + "epoch": 0.8949243098842387, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.682703018188477, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8795440793037415, + "num_tokens": 268375597.0, + "step": 7035 + }, + { + "epoch": 0.8950515201628292, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.624473571777344, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8541285991668701, + "num_tokens": 268412872.0, + "step": 7036 + }, + { + "epoch": 0.8951787304414197, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.308971405029297, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.862059473991394, + "num_tokens": 268452021.0, + "step": 7037 + }, + { + "epoch": 0.8953059407200101, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.686697006225586, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8692628145217896, + "num_tokens": 268486354.0, + "step": 7038 + }, + { + "epoch": 0.8954331509986007, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52304458618164, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.854215145111084, + "num_tokens": 268519918.0, + "step": 7039 + }, + { + "epoch": 0.8955603612771912, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.397380828857422, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8517783880233765, + "num_tokens": 268554259.0, + "step": 7040 + }, + { + "epoch": 0.8956875715557817, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.363544464111328, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8554309606552124, + "num_tokens": 268595785.0, + "step": 7041 + }, + { + "epoch": 0.8958147818343722, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45364761352539, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8464922904968262, + "num_tokens": 268640805.0, + "step": 7042 + }, + { + "epoch": 0.8959419921129628, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.425825119018555, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8669439554214478, + "num_tokens": 268682446.0, + "step": 7043 + }, + { + "epoch": 0.8960692023915532, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.539175033569336, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8697373867034912, + "num_tokens": 268720262.0, + "step": 7044 + }, + { + "epoch": 0.8961964126701437, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.454288482666016, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8483452796936035, + "num_tokens": 268760519.0, + "step": 7045 + }, + { + "epoch": 0.8963236229487342, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6270694732666, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8530899286270142, + "num_tokens": 268796621.0, + "step": 7046 + }, + { + "epoch": 0.8964508332273248, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.527122497558594, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8691749572753906, + "num_tokens": 268830527.0, + "step": 7047 + }, + { + "epoch": 0.8965780435059153, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48566246032715, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8659325838088989, + "num_tokens": 268862112.0, + "step": 7048 + }, + { + "epoch": 0.8967052537845058, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.438230514526367, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8596026301383972, + "num_tokens": 268901089.0, + "step": 7049 + }, + { + "epoch": 0.8968324640630962, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.538480758666992, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8632969856262207, + "num_tokens": 268942766.0, + "step": 7050 + }, + { + "epoch": 0.8969596743416868, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.631746292114258, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8674927353858948, + "num_tokens": 268978642.0, + "step": 7051 + }, + { + "epoch": 0.8970868846202773, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.43267059326172, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8670458793640137, + "num_tokens": 269016048.0, + "step": 7052 + }, + { + "epoch": 0.8972140948988678, + "ewc_loss": 0.0277099609375, + "ewc_loss_parallel": 2.765655517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54673194885254, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.862410306930542, + "num_tokens": 269054381.0, + "step": 7053 + }, + { + "epoch": 0.8973413051774584, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.505020141601562, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8650294542312622, + "num_tokens": 269090672.0, + "step": 7054 + }, + { + "epoch": 0.8974685154560489, + "ewc_loss": 0.027587890625, + "ewc_loss_parallel": 2.753734588623047e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53053092956543, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8651577830314636, + "num_tokens": 269130231.0, + "step": 7055 + }, + { + "epoch": 0.8975957257346393, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.443204879760742, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8654744029045105, + "num_tokens": 269166282.0, + "step": 7056 + }, + { + "epoch": 0.8977229360132298, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.509742736816406, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8647880554199219, + "num_tokens": 269205326.0, + "step": 7057 + }, + { + "epoch": 0.8978501462918204, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54641342163086, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8633214831352234, + "num_tokens": 269243903.0, + "step": 7058 + }, + { + "epoch": 0.8979773565704109, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.464275360107422, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8555808663368225, + "num_tokens": 269288330.0, + "step": 7059 + }, + { + "epoch": 0.8981045668490014, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.543880462646484, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8514756560325623, + "num_tokens": 269328731.0, + "step": 7060 + }, + { + "epoch": 0.898231777127592, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.341014862060547, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8621249198913574, + "num_tokens": 269369576.0, + "step": 7061 + }, + { + "epoch": 0.8983589874061825, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.452754974365234, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8607838153839111, + "num_tokens": 269407734.0, + "step": 7062 + }, + { + "epoch": 0.8984861976847729, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4482421875, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8407440185546875, + "num_tokens": 269444042.0, + "step": 7063 + }, + { + "epoch": 0.8986134079633634, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.442401885986328, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8843556642532349, + "num_tokens": 269480979.0, + "step": 7064 + }, + { + "epoch": 0.898740618241954, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.50050926208496, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8624622225761414, + "num_tokens": 269515594.0, + "step": 7065 + }, + { + "epoch": 0.8988678285205445, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.480262756347656, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.864226222038269, + "num_tokens": 269556974.0, + "step": 7066 + }, + { + "epoch": 0.898995038799135, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54173469543457, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8540840744972229, + "num_tokens": 269591088.0, + "step": 7067 + }, + { + "epoch": 0.8991222490777255, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.381872177124023, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8710892796516418, + "num_tokens": 269627921.0, + "step": 7068 + }, + { + "epoch": 0.899249459356316, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.516374588012695, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8460401296615601, + "num_tokens": 269666586.0, + "step": 7069 + }, + { + "epoch": 0.8993766696349065, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.560482025146484, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.864157497882843, + "num_tokens": 269701205.0, + "step": 7070 + }, + { + "epoch": 0.899503879913497, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4294376373291, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8741482496261597, + "num_tokens": 269739982.0, + "step": 7071 + }, + { + "epoch": 0.8996310901920875, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.435304641723633, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8640907406806946, + "num_tokens": 269773048.0, + "step": 7072 + }, + { + "epoch": 0.8997583004706781, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.461275100708008, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8610790967941284, + "num_tokens": 269811343.0, + "step": 7073 + }, + { + "epoch": 0.8998855107492686, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45577049255371, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8672329187393188, + "num_tokens": 269851601.0, + "step": 7074 + }, + { + "epoch": 0.900012721027859, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3797664642334, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.870360255241394, + "num_tokens": 269891396.0, + "step": 7075 + }, + { + "epoch": 0.9001399313064495, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.570758819580078, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8618998527526855, + "num_tokens": 269930292.0, + "step": 7076 + }, + { + "epoch": 0.9002671415850401, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.425472259521484, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8351353406906128, + "num_tokens": 269968464.0, + "step": 7077 + }, + { + "epoch": 0.9003943518636306, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54368782043457, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8491742610931396, + "num_tokens": 270011569.0, + "step": 7078 + }, + { + "epoch": 0.9005215621422211, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.559463500976562, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8570517301559448, + "num_tokens": 270050608.0, + "step": 7079 + }, + { + "epoch": 0.9006487724208116, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.449491500854492, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8477849960327148, + "num_tokens": 270096100.0, + "step": 7080 + }, + { + "epoch": 0.9007759826994021, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.49582862854004, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8722907304763794, + "num_tokens": 270140915.0, + "step": 7081 + }, + { + "epoch": 0.9009031929779926, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.579877853393555, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8623194694519043, + "num_tokens": 270184233.0, + "step": 7082 + }, + { + "epoch": 0.9010304032565831, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.499507904052734, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8600155115127563, + "num_tokens": 270223889.0, + "step": 7083 + }, + { + "epoch": 0.9011576135351737, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.335006713867188, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8662994503974915, + "num_tokens": 270266357.0, + "step": 7084 + }, + { + "epoch": 0.9012848238137642, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.63567543029785, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.869681179523468, + "num_tokens": 270298252.0, + "step": 7085 + }, + { + "epoch": 0.9014120340923547, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.486492156982422, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8544390201568604, + "num_tokens": 270343187.0, + "step": 7086 + }, + { + "epoch": 0.9015392443709451, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.360700607299805, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8508032560348511, + "num_tokens": 270385094.0, + "step": 7087 + }, + { + "epoch": 0.9016664546495357, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.545759201049805, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.848689615726471, + "num_tokens": 270421772.0, + "step": 7088 + }, + { + "epoch": 0.9017936649281262, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7089900970459, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8437044620513916, + "num_tokens": 270465033.0, + "step": 7089 + }, + { + "epoch": 0.9019208752067167, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.331287384033203, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.884334921836853, + "num_tokens": 270501370.0, + "step": 7090 + }, + { + "epoch": 0.9020480854853072, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.531251907348633, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.86586594581604, + "num_tokens": 270543571.0, + "step": 7091 + }, + { + "epoch": 0.9021752957638978, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.555864334106445, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.872254490852356, + "num_tokens": 270580635.0, + "step": 7092 + }, + { + "epoch": 0.9023025060424882, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.61604118347168, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8740600943565369, + "num_tokens": 270616534.0, + "step": 7093 + }, + { + "epoch": 0.9024297163210787, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.550621032714844, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8424028158187866, + "num_tokens": 270653998.0, + "step": 7094 + }, + { + "epoch": 0.9025569265996692, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4728946685791, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8609519004821777, + "num_tokens": 270688023.0, + "step": 7095 + }, + { + "epoch": 0.9026841368782598, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.637775421142578, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8439286351203918, + "num_tokens": 270721298.0, + "step": 7096 + }, + { + "epoch": 0.9028113471568503, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.261659622192383, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8670423030853271, + "num_tokens": 270762182.0, + "step": 7097 + }, + { + "epoch": 0.9029385574354408, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.69942283630371, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8766933083534241, + "num_tokens": 270799577.0, + "step": 7098 + }, + { + "epoch": 0.9030657677140312, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.419557571411133, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.863320529460907, + "num_tokens": 270840218.0, + "step": 7099 + }, + { + "epoch": 0.9031929779926218, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.282865524291992, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8829761147499084, + "num_tokens": 270872982.0, + "step": 7100 + }, + { + "epoch": 0.9033201882712123, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.573989868164062, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8666821718215942, + "num_tokens": 270908381.0, + "step": 7101 + }, + { + "epoch": 0.9034473985498028, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.324731826782227, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.860276997089386, + "num_tokens": 270949217.0, + "step": 7102 + }, + { + "epoch": 0.9035746088283934, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45117950439453, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8449012041091919, + "num_tokens": 270989667.0, + "step": 7103 + }, + { + "epoch": 0.9037018191069839, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.47770118713379, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8516868352890015, + "num_tokens": 271030395.0, + "step": 7104 + }, + { + "epoch": 0.9038290293855743, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48948097229004, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8434199690818787, + "num_tokens": 271068239.0, + "step": 7105 + }, + { + "epoch": 0.9039562396641648, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42643928527832, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.855379581451416, + "num_tokens": 271110294.0, + "step": 7106 + }, + { + "epoch": 0.9040834499427554, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.417821884155273, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8793503046035767, + "num_tokens": 271143288.0, + "step": 7107 + }, + { + "epoch": 0.9042106602213459, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.655038833618164, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8688420057296753, + "num_tokens": 271181183.0, + "step": 7108 + }, + { + "epoch": 0.9043378704999364, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.38375473022461, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8606256246566772, + "num_tokens": 271216201.0, + "step": 7109 + }, + { + "epoch": 0.9044650807785269, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53086280822754, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8686609268188477, + "num_tokens": 271250158.0, + "step": 7110 + }, + { + "epoch": 0.9045922910571175, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.383953094482422, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8744620084762573, + "num_tokens": 271289994.0, + "step": 7111 + }, + { + "epoch": 0.9047195013357079, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.590110778808594, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.867559552192688, + "num_tokens": 271325782.0, + "step": 7112 + }, + { + "epoch": 0.9048467116142984, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.553279876708984, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8494985699653625, + "num_tokens": 271361604.0, + "step": 7113 + }, + { + "epoch": 0.9049739218928889, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.619245529174805, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8711844682693481, + "num_tokens": 271398520.0, + "step": 7114 + }, + { + "epoch": 0.9051011321714795, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53081703186035, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8564999103546143, + "num_tokens": 271445138.0, + "step": 7115 + }, + { + "epoch": 0.90522834245007, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.552268981933594, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8672154545783997, + "num_tokens": 271479147.0, + "step": 7116 + }, + { + "epoch": 0.9053555527286605, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65899085998535, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8636811375617981, + "num_tokens": 271512477.0, + "step": 7117 + }, + { + "epoch": 0.905482763007251, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53362464904785, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8493310809135437, + "num_tokens": 271552355.0, + "step": 7118 + }, + { + "epoch": 0.9056099732858415, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.651458740234375, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8577088713645935, + "num_tokens": 271595825.0, + "step": 7119 + }, + { + "epoch": 0.905737183564432, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.69753646850586, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8504149913787842, + "num_tokens": 271627384.0, + "step": 7120 + }, + { + "epoch": 0.9058643938430225, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59442710876465, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8715051412582397, + "num_tokens": 271666711.0, + "step": 7121 + }, + { + "epoch": 0.9059916041216131, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.542381286621094, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8683496117591858, + "num_tokens": 271703145.0, + "step": 7122 + }, + { + "epoch": 0.9061188144002036, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.505062103271484, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8508076667785645, + "num_tokens": 271743131.0, + "step": 7123 + }, + { + "epoch": 0.906246024678794, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.720726013183594, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.866407036781311, + "num_tokens": 271778245.0, + "step": 7124 + }, + { + "epoch": 0.9063732349573845, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40842056274414, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8529326319694519, + "num_tokens": 271817279.0, + "step": 7125 + }, + { + "epoch": 0.9065004452359751, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.547420501708984, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8688188195228577, + "num_tokens": 271857136.0, + "step": 7126 + }, + { + "epoch": 0.9066276555145656, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.534732818603516, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8710382580757141, + "num_tokens": 271891651.0, + "step": 7127 + }, + { + "epoch": 0.9067548657931561, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40671157836914, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8669912815093994, + "num_tokens": 271925805.0, + "step": 7128 + }, + { + "epoch": 0.9068820760717466, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45223617553711, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8663897514343262, + "num_tokens": 271964954.0, + "step": 7129 + }, + { + "epoch": 0.9070092863503371, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.38667869567871, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8613222241401672, + "num_tokens": 272000331.0, + "step": 7130 + }, + { + "epoch": 0.9071364966289276, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.491058349609375, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8697932958602905, + "num_tokens": 272037853.0, + "step": 7131 + }, + { + "epoch": 0.9072637069075181, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.39323616027832, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8722577691078186, + "num_tokens": 272077663.0, + "step": 7132 + }, + { + "epoch": 0.9073909171861086, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.431087493896484, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8569283485412598, + "num_tokens": 272116942.0, + "step": 7133 + }, + { + "epoch": 0.9075181274646992, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.657642364501953, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8554642200469971, + "num_tokens": 272151319.0, + "step": 7134 + }, + { + "epoch": 0.9076453377432897, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.507259368896484, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8486239910125732, + "num_tokens": 272190817.0, + "step": 7135 + }, + { + "epoch": 0.9077725480218801, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.558473587036133, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8555067777633667, + "num_tokens": 272230461.0, + "step": 7136 + }, + { + "epoch": 0.9078997583004706, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.748655319213867, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8565924167633057, + "num_tokens": 272268510.0, + "step": 7137 + }, + { + "epoch": 0.9080269685790612, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.465951919555664, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8621319532394409, + "num_tokens": 272312430.0, + "step": 7138 + }, + { + "epoch": 0.9081541788576517, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58723258972168, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.860216498374939, + "num_tokens": 272348437.0, + "step": 7139 + }, + { + "epoch": 0.9082813891362422, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.74973487854004, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.855146586894989, + "num_tokens": 272378902.0, + "step": 7140 + }, + { + "epoch": 0.9084085994148328, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.55267333984375, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8757395148277283, + "num_tokens": 272421257.0, + "step": 7141 + }, + { + "epoch": 0.9085358096934232, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.568918228149414, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8611553311347961, + "num_tokens": 272459800.0, + "step": 7142 + }, + { + "epoch": 0.9086630199720137, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.470731735229492, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8812623620033264, + "num_tokens": 272501245.0, + "step": 7143 + }, + { + "epoch": 0.9087902302506042, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53783416748047, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8630651235580444, + "num_tokens": 272535873.0, + "step": 7144 + }, + { + "epoch": 0.9089174405291948, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.482606887817383, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8602949976921082, + "num_tokens": 272575213.0, + "step": 7145 + }, + { + "epoch": 0.9090446508077853, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.359638214111328, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8642342686653137, + "num_tokens": 272611960.0, + "step": 7146 + }, + { + "epoch": 0.9091718610863758, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.535234451293945, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8610320687294006, + "num_tokens": 272650895.0, + "step": 7147 + }, + { + "epoch": 0.9092990713649662, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.482248306274414, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8623594045639038, + "num_tokens": 272690617.0, + "step": 7148 + }, + { + "epoch": 0.9094262816435568, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.63856315612793, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8533538579940796, + "num_tokens": 272732526.0, + "step": 7149 + }, + { + "epoch": 0.9095534919221473, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.453922271728516, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8600168228149414, + "num_tokens": 272769882.0, + "step": 7150 + }, + { + "epoch": 0.9096807022007378, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.522045135498047, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8650563955307007, + "num_tokens": 272809614.0, + "step": 7151 + }, + { + "epoch": 0.9098079124793284, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.64861297607422, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8549153208732605, + "num_tokens": 272844819.0, + "step": 7152 + }, + { + "epoch": 0.9099351227579189, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52631187438965, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8676077723503113, + "num_tokens": 272882798.0, + "step": 7153 + }, + { + "epoch": 0.9100623330365093, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.56513023376465, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8648163676261902, + "num_tokens": 272917472.0, + "step": 7154 + }, + { + "epoch": 0.9101895433150998, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.491899490356445, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.851296603679657, + "num_tokens": 272959947.0, + "step": 7155 + }, + { + "epoch": 0.9103167535936904, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.554744720458984, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8448171615600586, + "num_tokens": 272998194.0, + "step": 7156 + }, + { + "epoch": 0.9104439638722809, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.545988082885742, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8791464567184448, + "num_tokens": 273034688.0, + "step": 7157 + }, + { + "epoch": 0.9105711741508714, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.66429901123047, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8722720146179199, + "num_tokens": 273071575.0, + "step": 7158 + }, + { + "epoch": 0.9106983844294619, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.430736541748047, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8363581299781799, + "num_tokens": 273106106.0, + "step": 7159 + }, + { + "epoch": 0.9108255947080524, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53896713256836, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8571287393569946, + "num_tokens": 273142959.0, + "step": 7160 + }, + { + "epoch": 0.9109528049866429, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.434825897216797, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.847832202911377, + "num_tokens": 273182781.0, + "step": 7161 + }, + { + "epoch": 0.9110800152652334, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.55364418029785, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8680486679077148, + "num_tokens": 273225738.0, + "step": 7162 + }, + { + "epoch": 0.9112072255438239, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.5157413482666, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8690291047096252, + "num_tokens": 273269166.0, + "step": 7163 + }, + { + "epoch": 0.9113344358224145, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.646820068359375, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8567452430725098, + "num_tokens": 273309093.0, + "step": 7164 + }, + { + "epoch": 0.911461646101005, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.527544021606445, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8672333359718323, + "num_tokens": 273346857.0, + "step": 7165 + }, + { + "epoch": 0.9115888563795955, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.438005447387695, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8576959371566772, + "num_tokens": 273385720.0, + "step": 7166 + }, + { + "epoch": 0.9117160666581859, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59223747253418, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8626999855041504, + "num_tokens": 273431079.0, + "step": 7167 + }, + { + "epoch": 0.9118432769367765, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.614885330200195, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8632726073265076, + "num_tokens": 273477316.0, + "step": 7168 + }, + { + "epoch": 0.911970487215367, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.764461517333984, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8652822971343994, + "num_tokens": 273512831.0, + "step": 7169 + }, + { + "epoch": 0.9120976974939575, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.636613845825195, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8558749556541443, + "num_tokens": 273551927.0, + "step": 7170 + }, + { + "epoch": 0.912224907772548, + "ewc_loss": 0.02783203125, + "ewc_loss_parallel": 2.777576446533203e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.572444915771484, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8655468225479126, + "num_tokens": 273583053.0, + "step": 7171 + }, + { + "epoch": 0.9123521180511386, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.915307998657227, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8515465259552002, + "num_tokens": 273617953.0, + "step": 7172 + }, + { + "epoch": 0.912479328329729, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.723310470581055, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8481664657592773, + "num_tokens": 273649959.0, + "step": 7173 + }, + { + "epoch": 0.9126065386083195, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.810821533203125, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8519554138183594, + "num_tokens": 273682666.0, + "step": 7174 + }, + { + "epoch": 0.9127337488869101, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.72211456298828, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8801910281181335, + "num_tokens": 273718641.0, + "step": 7175 + }, + { + "epoch": 0.9128609591655006, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.591232299804688, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8715595602989197, + "num_tokens": 273760502.0, + "step": 7176 + }, + { + "epoch": 0.9129881694440911, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.756399154663086, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8560680747032166, + "num_tokens": 273795068.0, + "step": 7177 + }, + { + "epoch": 0.9131153797226816, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.66197395324707, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8435752987861633, + "num_tokens": 273825099.0, + "step": 7178 + }, + { + "epoch": 0.9132425900012721, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.7894973754882812e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.556753158569336, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8672223687171936, + "num_tokens": 273862566.0, + "step": 7179 + }, + { + "epoch": 0.9133698002798626, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.651243209838867, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8631477355957031, + "num_tokens": 273896107.0, + "step": 7180 + }, + { + "epoch": 0.9134970105584531, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.729145050048828, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8685697317123413, + "num_tokens": 273934066.0, + "step": 7181 + }, + { + "epoch": 0.9136242208370436, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4507999420166, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8610345125198364, + "num_tokens": 273971843.0, + "step": 7182 + }, + { + "epoch": 0.9137514311156342, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.805213928222656, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8541991710662842, + "num_tokens": 274001646.0, + "step": 7183 + }, + { + "epoch": 0.9138786413942247, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.477996826171875, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8468709588050842, + "num_tokens": 274042656.0, + "step": 7184 + }, + { + "epoch": 0.9140058516728151, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.623355865478516, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8477745056152344, + "num_tokens": 274084609.0, + "step": 7185 + }, + { + "epoch": 0.9141330619514056, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.761756896972656, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8651195168495178, + "num_tokens": 274127503.0, + "step": 7186 + }, + { + "epoch": 0.9142602722299962, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.631940841674805, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8668453097343445, + "num_tokens": 274164126.0, + "step": 7187 + }, + { + "epoch": 0.9143874825085867, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.528148651123047, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8610929250717163, + "num_tokens": 274198104.0, + "step": 7188 + }, + { + "epoch": 0.9145146927871772, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54144859313965, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8511662483215332, + "num_tokens": 274233487.0, + "step": 7189 + }, + { + "epoch": 0.9146419030657678, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.645145416259766, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8662278652191162, + "num_tokens": 274267317.0, + "step": 7190 + }, + { + "epoch": 0.9147691133443582, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59588623046875, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8548696637153625, + "num_tokens": 274304169.0, + "step": 7191 + }, + { + "epoch": 0.9148963236229487, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54766082763672, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8631722927093506, + "num_tokens": 274343265.0, + "step": 7192 + }, + { + "epoch": 0.9150235339015392, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6259708404541, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.846315860748291, + "num_tokens": 274380756.0, + "step": 7193 + }, + { + "epoch": 0.9151507441801298, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.592496871948242, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8717037439346313, + "num_tokens": 274422035.0, + "step": 7194 + }, + { + "epoch": 0.9152779544587203, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.583951950073242, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8515146970748901, + "num_tokens": 274460875.0, + "step": 7195 + }, + { + "epoch": 0.9154051647373108, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.635019302368164, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8646413087844849, + "num_tokens": 274495807.0, + "step": 7196 + }, + { + "epoch": 0.9155323750159012, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.471567153930664, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8687463402748108, + "num_tokens": 274533098.0, + "step": 7197 + }, + { + "epoch": 0.9156595852944918, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.666767120361328, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8579252362251282, + "num_tokens": 274565282.0, + "step": 7198 + }, + { + "epoch": 0.9157867955730823, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.63459014892578, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8651746511459351, + "num_tokens": 274605401.0, + "step": 7199 + }, + { + "epoch": 0.9159140058516728, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.322044372558594, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8558549880981445, + "num_tokens": 274645018.0, + "step": 7200 + }, + { + "epoch": 0.9160412161302633, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.673120498657227, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8704845309257507, + "num_tokens": 274681541.0, + "step": 7201 + }, + { + "epoch": 0.9161684264088539, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.495159149169922, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8653080463409424, + "num_tokens": 274716883.0, + "step": 7202 + }, + { + "epoch": 0.9162956366874443, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.543338775634766, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8558363914489746, + "num_tokens": 274753038.0, + "step": 7203 + }, + { + "epoch": 0.9164228469660348, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.74581527709961, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8674564361572266, + "num_tokens": 274790760.0, + "step": 7204 + }, + { + "epoch": 0.9165500572446253, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.419898986816406, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8549656867980957, + "num_tokens": 274830872.0, + "step": 7205 + }, + { + "epoch": 0.9166772675232159, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.67685317993164, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8488956093788147, + "num_tokens": 274869832.0, + "step": 7206 + }, + { + "epoch": 0.9168044778018064, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.471933364868164, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8542752265930176, + "num_tokens": 274903853.0, + "step": 7207 + }, + { + "epoch": 0.9169316880803969, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.754030227661133, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8610334396362305, + "num_tokens": 274942911.0, + "step": 7208 + }, + { + "epoch": 0.9170588983589874, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.57481575012207, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8687434792518616, + "num_tokens": 274986427.0, + "step": 7209 + }, + { + "epoch": 0.9171861086375779, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.663429260253906, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8642994165420532, + "num_tokens": 275021557.0, + "step": 7210 + }, + { + "epoch": 0.9173133189161684, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.585098266601562, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8502364158630371, + "num_tokens": 275061308.0, + "step": 7211 + }, + { + "epoch": 0.9174405291947589, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.524728775024414, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8550043106079102, + "num_tokens": 275101497.0, + "step": 7212 + }, + { + "epoch": 0.9175677394733495, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.796310424804688, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8490302562713623, + "num_tokens": 275138317.0, + "step": 7213 + }, + { + "epoch": 0.91769494975194, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3768253326416, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8607390522956848, + "num_tokens": 275180119.0, + "step": 7214 + }, + { + "epoch": 0.9178221600305305, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.76234245300293, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8593744039535522, + "num_tokens": 275212083.0, + "step": 7215 + }, + { + "epoch": 0.9179493703091209, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59202003479004, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8519777059555054, + "num_tokens": 275247872.0, + "step": 7216 + }, + { + "epoch": 0.9180765805877115, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59535789489746, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8660900592803955, + "num_tokens": 275287022.0, + "step": 7217 + }, + { + "epoch": 0.918203790866302, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.689128875732422, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.863840639591217, + "num_tokens": 275322259.0, + "step": 7218 + }, + { + "epoch": 0.9183310011448925, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.608253479003906, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8759221434593201, + "num_tokens": 275357410.0, + "step": 7219 + }, + { + "epoch": 0.918458211423483, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59539222717285, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8727028369903564, + "num_tokens": 275388720.0, + "step": 7220 + }, + { + "epoch": 0.9185854217020736, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.589000701904297, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8676189184188843, + "num_tokens": 275427699.0, + "step": 7221 + }, + { + "epoch": 0.918712631980664, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65305519104004, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8639886379241943, + "num_tokens": 275472220.0, + "step": 7222 + }, + { + "epoch": 0.9188398422592545, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.68756103515625, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8488928079605103, + "num_tokens": 275506693.0, + "step": 7223 + }, + { + "epoch": 0.918967052537845, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.5459041595459, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8649578094482422, + "num_tokens": 275547295.0, + "step": 7224 + }, + { + "epoch": 0.9190942628164356, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.561887741088867, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.857448399066925, + "num_tokens": 275591552.0, + "step": 7225 + }, + { + "epoch": 0.9192214730950261, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.728357315063477, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.835472583770752, + "num_tokens": 275632197.0, + "step": 7226 + }, + { + "epoch": 0.9193486833736166, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.643817901611328, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8552322387695312, + "num_tokens": 275670680.0, + "step": 7227 + }, + { + "epoch": 0.919475893652207, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.592546463012695, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8596386909484863, + "num_tokens": 275709315.0, + "step": 7228 + }, + { + "epoch": 0.9196031039307976, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.43486785888672, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8607059717178345, + "num_tokens": 275740405.0, + "step": 7229 + }, + { + "epoch": 0.9197303142093881, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7674503326416, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8780655264854431, + "num_tokens": 275780915.0, + "step": 7230 + }, + { + "epoch": 0.9198575244879786, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.657865524291992, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8553347587585449, + "num_tokens": 275817832.0, + "step": 7231 + }, + { + "epoch": 0.9199847347665692, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.700223922729492, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8650392293930054, + "num_tokens": 275857621.0, + "step": 7232 + }, + { + "epoch": 0.9201119450451597, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.742084503173828, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.857481062412262, + "num_tokens": 275898917.0, + "step": 7233 + }, + { + "epoch": 0.9202391553237501, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.704259872436523, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8653048276901245, + "num_tokens": 275930151.0, + "step": 7234 + }, + { + "epoch": 0.9203663656023406, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.480039596557617, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8689506649971008, + "num_tokens": 275966901.0, + "step": 7235 + }, + { + "epoch": 0.9204935758809312, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.76384925842285, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8833397626876831, + "num_tokens": 276002845.0, + "step": 7236 + }, + { + "epoch": 0.9206207861595217, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.51837921142578, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.859703779220581, + "num_tokens": 276038476.0, + "step": 7237 + }, + { + "epoch": 0.9207479964381122, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.566614151000977, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8576998114585876, + "num_tokens": 276077867.0, + "step": 7238 + }, + { + "epoch": 0.9208752067167028, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.717918395996094, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8504796028137207, + "num_tokens": 276123289.0, + "step": 7239 + }, + { + "epoch": 0.9210024169952932, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.431381225585938, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8488391637802124, + "num_tokens": 276157357.0, + "step": 7240 + }, + { + "epoch": 0.9211296272738837, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48914909362793, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8821628093719482, + "num_tokens": 276195548.0, + "step": 7241 + }, + { + "epoch": 0.9212568375524742, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.533611297607422, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8632739782333374, + "num_tokens": 276233533.0, + "step": 7242 + }, + { + "epoch": 0.9213840478310648, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.649951934814453, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8652837872505188, + "num_tokens": 276274310.0, + "step": 7243 + }, + { + "epoch": 0.9215112581096553, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.508806228637695, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8587398529052734, + "num_tokens": 276310272.0, + "step": 7244 + }, + { + "epoch": 0.9216384683882458, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3508243560791, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8554284572601318, + "num_tokens": 276350334.0, + "step": 7245 + }, + { + "epoch": 0.9217656786668362, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.424938201904297, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8559719324111938, + "num_tokens": 276393913.0, + "step": 7246 + }, + { + "epoch": 0.9218928889454268, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.599411010742188, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.857313334941864, + "num_tokens": 276430368.0, + "step": 7247 + }, + { + "epoch": 0.9220200992240173, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53558349609375, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8549966812133789, + "num_tokens": 276470930.0, + "step": 7248 + }, + { + "epoch": 0.9221473095026078, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.619306564331055, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8658920526504517, + "num_tokens": 276509359.0, + "step": 7249 + }, + { + "epoch": 0.9222745197811983, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.476877212524414, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8532053232192993, + "num_tokens": 276546980.0, + "step": 7250 + }, + { + "epoch": 0.9224017300597889, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.474441528320312, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8481169939041138, + "num_tokens": 276586212.0, + "step": 7251 + }, + { + "epoch": 0.9225289403383793, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.4892635345459, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8625041246414185, + "num_tokens": 276623946.0, + "step": 7252 + }, + { + "epoch": 0.9226561506169698, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.503210067749023, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.855554461479187, + "num_tokens": 276667567.0, + "step": 7253 + }, + { + "epoch": 0.9227833608955603, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.61962127685547, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.854629397392273, + "num_tokens": 276708479.0, + "step": 7254 + }, + { + "epoch": 0.9229105711741509, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.375106811523438, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8553109765052795, + "num_tokens": 276744627.0, + "step": 7255 + }, + { + "epoch": 0.9230377814527414, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.666847229003906, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8614196181297302, + "num_tokens": 276782830.0, + "step": 7256 + }, + { + "epoch": 0.9231649917313319, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.47834014892578, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8590211868286133, + "num_tokens": 276825745.0, + "step": 7257 + }, + { + "epoch": 0.9232922020099223, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75047492980957, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8661059141159058, + "num_tokens": 276866743.0, + "step": 7258 + }, + { + "epoch": 0.9234194122885129, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.5607967376709, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8567975759506226, + "num_tokens": 276903884.0, + "step": 7259 + }, + { + "epoch": 0.9235466225671034, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.630088806152344, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8667174577713013, + "num_tokens": 276939142.0, + "step": 7260 + }, + { + "epoch": 0.9236738328456939, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.746551513671875, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8609525561332703, + "num_tokens": 276979041.0, + "step": 7261 + }, + { + "epoch": 0.9238010431242845, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.558307647705078, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8718760013580322, + "num_tokens": 277015590.0, + "step": 7262 + }, + { + "epoch": 0.923928253402875, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.63956642150879, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8540173172950745, + "num_tokens": 277055745.0, + "step": 7263 + }, + { + "epoch": 0.9240554636814655, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.555255889892578, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8566697835922241, + "num_tokens": 277098856.0, + "step": 7264 + }, + { + "epoch": 0.9241826739600559, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58833885192871, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8717484474182129, + "num_tokens": 277139165.0, + "step": 7265 + }, + { + "epoch": 0.9243098842386465, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.655359268188477, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.864500880241394, + "num_tokens": 277171879.0, + "step": 7266 + }, + { + "epoch": 0.924437094517237, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.713058471679688, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8619997501373291, + "num_tokens": 277211719.0, + "step": 7267 + }, + { + "epoch": 0.9245643047958275, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.725061416625977, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8524882793426514, + "num_tokens": 277252675.0, + "step": 7268 + }, + { + "epoch": 0.924691515074418, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.561904907226562, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8493936061859131, + "num_tokens": 277287529.0, + "step": 7269 + }, + { + "epoch": 0.9248187253530086, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.735492706298828, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8602499961853027, + "num_tokens": 277328392.0, + "step": 7270 + }, + { + "epoch": 0.924945935631599, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59259796142578, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8707096576690674, + "num_tokens": 277368412.0, + "step": 7271 + }, + { + "epoch": 0.9250731459101895, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.471759796142578, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8661645650863647, + "num_tokens": 277404373.0, + "step": 7272 + }, + { + "epoch": 0.92520035618878, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.93605613708496, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8582867383956909, + "num_tokens": 277444988.0, + "step": 7273 + }, + { + "epoch": 0.9253275664673706, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.521881103515625, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8701138496398926, + "num_tokens": 277487501.0, + "step": 7274 + }, + { + "epoch": 0.9254547767459611, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.629255294799805, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.857742428779602, + "num_tokens": 277529056.0, + "step": 7275 + }, + { + "epoch": 0.9255819870245516, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.624561309814453, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8730836510658264, + "num_tokens": 277563022.0, + "step": 7276 + }, + { + "epoch": 0.925709197303142, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.45073699951172, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8684489727020264, + "num_tokens": 277603879.0, + "step": 7277 + }, + { + "epoch": 0.9258364075817326, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.600135803222656, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8572890758514404, + "num_tokens": 277643711.0, + "step": 7278 + }, + { + "epoch": 0.9259636178603231, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.790058135986328, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8605062365531921, + "num_tokens": 277680667.0, + "step": 7279 + }, + { + "epoch": 0.9260908281389136, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6943359375, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8586171865463257, + "num_tokens": 277715548.0, + "step": 7280 + }, + { + "epoch": 0.9262180384175042, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.631019592285156, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8601905703544617, + "num_tokens": 277745287.0, + "step": 7281 + }, + { + "epoch": 0.9263452486960947, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.735336303710938, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8538711071014404, + "num_tokens": 277781484.0, + "step": 7282 + }, + { + "epoch": 0.9264724589746851, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52317237854004, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8635326027870178, + "num_tokens": 277818429.0, + "step": 7283 + }, + { + "epoch": 0.9265996692532756, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.874088287353516, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8680347204208374, + "num_tokens": 277853077.0, + "step": 7284 + }, + { + "epoch": 0.9267268795318662, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.40150260925293, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8780440092086792, + "num_tokens": 277893997.0, + "step": 7285 + }, + { + "epoch": 0.9268540898104567, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.646669387817383, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8500756025314331, + "num_tokens": 277934680.0, + "step": 7286 + }, + { + "epoch": 0.9269813000890472, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.3983211517334, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8532525300979614, + "num_tokens": 277980370.0, + "step": 7287 + }, + { + "epoch": 0.9271085103676378, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.578487396240234, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8733277320861816, + "num_tokens": 278017257.0, + "step": 7288 + }, + { + "epoch": 0.9272357206462282, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52015495300293, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8609083890914917, + "num_tokens": 278054835.0, + "step": 7289 + }, + { + "epoch": 0.9273629309248187, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.69842529296875, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8587319850921631, + "num_tokens": 278089510.0, + "step": 7290 + }, + { + "epoch": 0.9274901412034092, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.622106552124023, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8626441359519958, + "num_tokens": 278126179.0, + "step": 7291 + }, + { + "epoch": 0.9276173514819998, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.537273406982422, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8391255140304565, + "num_tokens": 278170837.0, + "step": 7292 + }, + { + "epoch": 0.9277445617605903, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.652244567871094, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.866775393486023, + "num_tokens": 278207410.0, + "step": 7293 + }, + { + "epoch": 0.9278717720391808, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6538028717041, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8805181980133057, + "num_tokens": 278240495.0, + "step": 7294 + }, + { + "epoch": 0.9279989823177712, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.689189910888672, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.852819561958313, + "num_tokens": 278278449.0, + "step": 7295 + }, + { + "epoch": 0.9281261925963618, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.470542907714844, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8713758587837219, + "num_tokens": 278321434.0, + "step": 7296 + }, + { + "epoch": 0.9282534028749523, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.699655532836914, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8564109802246094, + "num_tokens": 278363730.0, + "step": 7297 + }, + { + "epoch": 0.9283806131535428, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.599641799926758, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8623690605163574, + "num_tokens": 278395493.0, + "step": 7298 + }, + { + "epoch": 0.9285078234321333, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.67681121826172, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8633147478103638, + "num_tokens": 278433024.0, + "step": 7299 + }, + { + "epoch": 0.9286350337107239, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.585384368896484, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8618999719619751, + "num_tokens": 278471311.0, + "step": 7300 + }, + { + "epoch": 0.9287622439893143, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.678447723388672, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8812793493270874, + "num_tokens": 278509151.0, + "step": 7301 + }, + { + "epoch": 0.9288894542679048, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.638242721557617, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8683751225471497, + "num_tokens": 278548743.0, + "step": 7302 + }, + { + "epoch": 0.9290166645464953, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.9776611328125, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8728047609329224, + "num_tokens": 278590722.0, + "step": 7303 + }, + { + "epoch": 0.9291438748250859, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70250129699707, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8484803438186646, + "num_tokens": 278628150.0, + "step": 7304 + }, + { + "epoch": 0.9292710851036764, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.639245986938477, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8450506329536438, + "num_tokens": 278672543.0, + "step": 7305 + }, + { + "epoch": 0.9293982953822669, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.777009963989258, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.870130717754364, + "num_tokens": 278707234.0, + "step": 7306 + }, + { + "epoch": 0.9295255056608573, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.562543869018555, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8641226291656494, + "num_tokens": 278734678.0, + "step": 7307 + }, + { + "epoch": 0.9296527159394479, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.68292808532715, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8708575367927551, + "num_tokens": 278778284.0, + "step": 7308 + }, + { + "epoch": 0.9297799262180384, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.870861053466797, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8615775108337402, + "num_tokens": 278815363.0, + "step": 7309 + }, + { + "epoch": 0.9299071364966289, + "ewc_loss": 0.0279541015625, + "ewc_loss_parallel": 2.8014183044433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.608097076416016, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8495664596557617, + "num_tokens": 278860777.0, + "step": 7310 + }, + { + "epoch": 0.9300343467752195, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.5283145904541, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8760350942611694, + "num_tokens": 278890552.0, + "step": 7311 + }, + { + "epoch": 0.93016155705381, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.72244644165039, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8506077527999878, + "num_tokens": 278924741.0, + "step": 7312 + }, + { + "epoch": 0.9302887673324005, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.632030487060547, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8701930046081543, + "num_tokens": 278965185.0, + "step": 7313 + }, + { + "epoch": 0.9304159776109909, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52617073059082, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8501324653625488, + "num_tokens": 278998961.0, + "step": 7314 + }, + { + "epoch": 0.9305431878895815, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.583744049072266, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8492165803909302, + "num_tokens": 279032689.0, + "step": 7315 + }, + { + "epoch": 0.930670398168172, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.550846099853516, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8487757444381714, + "num_tokens": 279080305.0, + "step": 7316 + }, + { + "epoch": 0.9307976084467625, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.527286529541016, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8619788885116577, + "num_tokens": 279120581.0, + "step": 7317 + }, + { + "epoch": 0.930924818725353, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.661283493041992, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.873518705368042, + "num_tokens": 279165050.0, + "step": 7318 + }, + { + "epoch": 0.9310520290039436, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.68583869934082, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8660505414009094, + "num_tokens": 279209393.0, + "step": 7319 + }, + { + "epoch": 0.931179239282534, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.469036102294922, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8489254117012024, + "num_tokens": 279250569.0, + "step": 7320 + }, + { + "epoch": 0.9313064495611245, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.583982467651367, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8670627474784851, + "num_tokens": 279291154.0, + "step": 7321 + }, + { + "epoch": 0.931433659839715, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.489534378051758, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8606649041175842, + "num_tokens": 279326967.0, + "step": 7322 + }, + { + "epoch": 0.9315608701183056, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.637187957763672, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8738610744476318, + "num_tokens": 279367129.0, + "step": 7323 + }, + { + "epoch": 0.9316880803968961, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.750642776489258, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8677759766578674, + "num_tokens": 279403651.0, + "step": 7324 + }, + { + "epoch": 0.9318152906754866, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.444875717163086, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.853428840637207, + "num_tokens": 279439166.0, + "step": 7325 + }, + { + "epoch": 0.931942500954077, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.640235900878906, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8601454496383667, + "num_tokens": 279473489.0, + "step": 7326 + }, + { + "epoch": 0.9320697112326676, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.66189193725586, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.864647388458252, + "num_tokens": 279511709.0, + "step": 7327 + }, + { + "epoch": 0.9321969215112581, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.67845344543457, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8496880531311035, + "num_tokens": 279551638.0, + "step": 7328 + }, + { + "epoch": 0.9323241317898486, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58897590637207, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8728082180023193, + "num_tokens": 279587304.0, + "step": 7329 + }, + { + "epoch": 0.9324513420684392, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.592151641845703, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.852784276008606, + "num_tokens": 279628138.0, + "step": 7330 + }, + { + "epoch": 0.9325785523470297, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.555055618286133, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8664435148239136, + "num_tokens": 279663064.0, + "step": 7331 + }, + { + "epoch": 0.9327057626256201, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.717628479003906, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8532266616821289, + "num_tokens": 279704937.0, + "step": 7332 + }, + { + "epoch": 0.9328329729042106, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6765079498291, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.864702045917511, + "num_tokens": 279738215.0, + "step": 7333 + }, + { + "epoch": 0.9329601831828012, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58565330505371, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8461949825286865, + "num_tokens": 279771996.0, + "step": 7334 + }, + { + "epoch": 0.9330873934613917, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.49891471862793, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8756489753723145, + "num_tokens": 279810037.0, + "step": 7335 + }, + { + "epoch": 0.9332146037399822, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.500011444091797, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8545422554016113, + "num_tokens": 279847394.0, + "step": 7336 + }, + { + "epoch": 0.9333418140185727, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.679426193237305, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.847903847694397, + "num_tokens": 279879057.0, + "step": 7337 + }, + { + "epoch": 0.9334690242971632, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.482336044311523, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8553574085235596, + "num_tokens": 279911775.0, + "step": 7338 + }, + { + "epoch": 0.9335962345757537, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.721851348876953, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.864493727684021, + "num_tokens": 279947661.0, + "step": 7339 + }, + { + "epoch": 0.9337234448543442, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.68269157409668, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.862450897693634, + "num_tokens": 279977507.0, + "step": 7340 + }, + { + "epoch": 0.9338506551329347, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.688261032104492, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8585069179534912, + "num_tokens": 280016749.0, + "step": 7341 + }, + { + "epoch": 0.9339778654115253, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59761619567871, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8637049198150635, + "num_tokens": 280056092.0, + "step": 7342 + }, + { + "epoch": 0.9341050756901158, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.736919403076172, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8734570741653442, + "num_tokens": 280089803.0, + "step": 7343 + }, + { + "epoch": 0.9342322859687062, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.539386749267578, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8495361804962158, + "num_tokens": 280126190.0, + "step": 7344 + }, + { + "epoch": 0.9343594962472968, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.670068740844727, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8732186555862427, + "num_tokens": 280162743.0, + "step": 7345 + }, + { + "epoch": 0.9344867065258873, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.679790496826172, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8616751432418823, + "num_tokens": 280203676.0, + "step": 7346 + }, + { + "epoch": 0.9346139168044778, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.48971939086914, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8784674406051636, + "num_tokens": 280241765.0, + "step": 7347 + }, + { + "epoch": 0.9347411270830683, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.810075759887695, + "learning_rate": 1e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8403235077857971, + "num_tokens": 280280103.0, + "step": 7348 + }, + { + "epoch": 0.9348683373616589, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.668075561523438, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8444264531135559, + "num_tokens": 280314674.0, + "step": 7349 + }, + { + "epoch": 0.9349955476402493, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.519302368164062, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8629436492919922, + "num_tokens": 280353890.0, + "step": 7350 + }, + { + "epoch": 0.9351227579188398, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.649723052978516, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8539185523986816, + "num_tokens": 280396403.0, + "step": 7351 + }, + { + "epoch": 0.9352499681974303, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.647994995117188, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8621619343757629, + "num_tokens": 280436154.0, + "step": 7352 + }, + { + "epoch": 0.9353771784760209, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.765260696411133, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8692992925643921, + "num_tokens": 280476480.0, + "step": 7353 + }, + { + "epoch": 0.9355043887546114, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.678089141845703, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8652772903442383, + "num_tokens": 280505976.0, + "step": 7354 + }, + { + "epoch": 0.9356315990332019, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.437362670898438, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8579037189483643, + "num_tokens": 280548365.0, + "step": 7355 + }, + { + "epoch": 0.9357588093117923, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.712318420410156, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8792913556098938, + "num_tokens": 280588074.0, + "step": 7356 + }, + { + "epoch": 0.9358860195903829, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6798095703125, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8710991740226746, + "num_tokens": 280626915.0, + "step": 7357 + }, + { + "epoch": 0.9360132298689734, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.64006996154785, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8550066947937012, + "num_tokens": 280660088.0, + "step": 7358 + }, + { + "epoch": 0.9361404401475639, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70395851135254, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8604047298431396, + "num_tokens": 280701326.0, + "step": 7359 + }, + { + "epoch": 0.9362676504261545, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.681814193725586, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.871399998664856, + "num_tokens": 280740256.0, + "step": 7360 + }, + { + "epoch": 0.936394860704745, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.61102294921875, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8756735324859619, + "num_tokens": 280773515.0, + "step": 7361 + }, + { + "epoch": 0.9365220709833355, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.729808807373047, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8362460136413574, + "num_tokens": 280817207.0, + "step": 7362 + }, + { + "epoch": 0.9366492812619259, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.742446899414062, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8493586778640747, + "num_tokens": 280860253.0, + "step": 7363 + }, + { + "epoch": 0.9367764915405165, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65064811706543, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8584907054901123, + "num_tokens": 280892634.0, + "step": 7364 + }, + { + "epoch": 0.936903701819107, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52124786376953, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8698257207870483, + "num_tokens": 280930912.0, + "step": 7365 + }, + { + "epoch": 0.9370309120976975, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.901540756225586, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.865003764629364, + "num_tokens": 280971588.0, + "step": 7366 + }, + { + "epoch": 0.937158122376288, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.669116973876953, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8701088428497314, + "num_tokens": 281009898.0, + "step": 7367 + }, + { + "epoch": 0.9372853326548786, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.60831642150879, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.854931116104126, + "num_tokens": 281046726.0, + "step": 7368 + }, + { + "epoch": 0.937412542933469, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.912181854248047, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8758636116981506, + "num_tokens": 281085163.0, + "step": 7369 + }, + { + "epoch": 0.9375397532120595, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.42056655883789, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8680565357208252, + "num_tokens": 281128660.0, + "step": 7370 + }, + { + "epoch": 0.93766696349065, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.783910751342773, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8442833423614502, + "num_tokens": 281163780.0, + "step": 7371 + }, + { + "epoch": 0.9377941737692406, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.672653198242188, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8694730997085571, + "num_tokens": 281204247.0, + "step": 7372 + }, + { + "epoch": 0.9379213840478311, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.44316291809082, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8866013288497925, + "num_tokens": 281239518.0, + "step": 7373 + }, + { + "epoch": 0.9380485943264216, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.681577682495117, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8703041076660156, + "num_tokens": 281275916.0, + "step": 7374 + }, + { + "epoch": 0.938175804605012, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.689285278320312, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8723551630973816, + "num_tokens": 281307615.0, + "step": 7375 + }, + { + "epoch": 0.9383030148836026, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.751367568969727, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8663046360015869, + "num_tokens": 281342677.0, + "step": 7376 + }, + { + "epoch": 0.9384302251621931, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.619970321655273, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8574059009552002, + "num_tokens": 281379545.0, + "step": 7377 + }, + { + "epoch": 0.9385574354407836, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.758392333984375, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.843612790107727, + "num_tokens": 281424169.0, + "step": 7378 + }, + { + "epoch": 0.9386846457193742, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.642900466918945, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.881813645362854, + "num_tokens": 281461306.0, + "step": 7379 + }, + { + "epoch": 0.9388118559979647, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.834280014038086, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.874600887298584, + "num_tokens": 281498753.0, + "step": 7380 + }, + { + "epoch": 0.9389390662765551, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.604473114013672, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8714563250541687, + "num_tokens": 281537163.0, + "step": 7381 + }, + { + "epoch": 0.9390662765551456, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.677854537963867, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8531945943832397, + "num_tokens": 281577926.0, + "step": 7382 + }, + { + "epoch": 0.9391934868337362, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59956169128418, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8810995221138, + "num_tokens": 281621283.0, + "step": 7383 + }, + { + "epoch": 0.9393206971123267, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.637962341308594, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8659261465072632, + "num_tokens": 281658336.0, + "step": 7384 + }, + { + "epoch": 0.9394479073909172, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.746749877929688, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8652067184448242, + "num_tokens": 281695137.0, + "step": 7385 + }, + { + "epoch": 0.9395751176695077, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.514305114746094, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8762456178665161, + "num_tokens": 281733570.0, + "step": 7386 + }, + { + "epoch": 0.9397023279480982, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.826257705688477, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8637287616729736, + "num_tokens": 281771163.0, + "step": 7387 + }, + { + "epoch": 0.9398295382266887, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.533029556274414, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8647583723068237, + "num_tokens": 281813715.0, + "step": 7388 + }, + { + "epoch": 0.9399567485052792, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.617719650268555, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.860346794128418, + "num_tokens": 281856921.0, + "step": 7389 + }, + { + "epoch": 0.9400839587838697, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62790870666504, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.853882372379303, + "num_tokens": 281893273.0, + "step": 7390 + }, + { + "epoch": 0.9402111690624603, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.784717559814453, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.866705060005188, + "num_tokens": 281927452.0, + "step": 7391 + }, + { + "epoch": 0.9403383793410508, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.542001724243164, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.886892557144165, + "num_tokens": 281965359.0, + "step": 7392 + }, + { + "epoch": 0.9404655896196412, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.775270462036133, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8602457046508789, + "num_tokens": 281999209.0, + "step": 7393 + }, + { + "epoch": 0.9405927998982317, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.769216537475586, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.851861298084259, + "num_tokens": 282042129.0, + "step": 7394 + }, + { + "epoch": 0.9407200101768223, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59219741821289, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8537916541099548, + "num_tokens": 282087941.0, + "step": 7395 + }, + { + "epoch": 0.9408472204554128, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.808286666870117, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8680739998817444, + "num_tokens": 282123130.0, + "step": 7396 + }, + { + "epoch": 0.9409744307340033, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.745121002197266, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.858667254447937, + "num_tokens": 282161142.0, + "step": 7397 + }, + { + "epoch": 0.9411016410125939, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.07466697692871, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8865025639533997, + "num_tokens": 282196363.0, + "step": 7398 + }, + { + "epoch": 0.9412288512911843, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.53083610534668, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8426679968833923, + "num_tokens": 282233202.0, + "step": 7399 + }, + { + "epoch": 0.9413560615697748, + "ewc_loss": 0.028076171875, + "ewc_loss_parallel": 2.8133392333984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.585979461669922, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8636817336082458, + "num_tokens": 282270976.0, + "step": 7400 + }, + { + "epoch": 0.9414832718483653, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.792158126831055, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8726403713226318, + "num_tokens": 282312223.0, + "step": 7401 + }, + { + "epoch": 0.9416104821269559, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62670135498047, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8467175960540771, + "num_tokens": 282350418.0, + "step": 7402 + }, + { + "epoch": 0.9417376924055464, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.662200927734375, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8632174730300903, + "num_tokens": 282380751.0, + "step": 7403 + }, + { + "epoch": 0.9418649026841369, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.470781326293945, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.874504804611206, + "num_tokens": 282420248.0, + "step": 7404 + }, + { + "epoch": 0.9419921129627273, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.702512741088867, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8675971031188965, + "num_tokens": 282457951.0, + "step": 7405 + }, + { + "epoch": 0.9421193232413179, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.478395462036133, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8619024753570557, + "num_tokens": 282507379.0, + "step": 7406 + }, + { + "epoch": 0.9422465335199084, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.684810638427734, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8512293100357056, + "num_tokens": 282546905.0, + "step": 7407 + }, + { + "epoch": 0.9423737437984989, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.44013786315918, + "learning_rate": 1e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8347448706626892, + "num_tokens": 282591368.0, + "step": 7408 + }, + { + "epoch": 0.9425009540770894, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.574432373046875, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8601336479187012, + "num_tokens": 282617331.0, + "step": 7409 + }, + { + "epoch": 0.94262816435568, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.669483184814453, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.876107931137085, + "num_tokens": 282652346.0, + "step": 7410 + }, + { + "epoch": 0.9427553746342705, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.516225814819336, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8667865991592407, + "num_tokens": 282691564.0, + "step": 7411 + }, + { + "epoch": 0.9428825849128609, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.657085418701172, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8727160096168518, + "num_tokens": 282728186.0, + "step": 7412 + }, + { + "epoch": 0.9430097951914514, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.600265502929688, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8554741740226746, + "num_tokens": 282770491.0, + "step": 7413 + }, + { + "epoch": 0.943137005470042, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7958927154541, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.855956494808197, + "num_tokens": 282809824.0, + "step": 7414 + }, + { + "epoch": 0.9432642157486325, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.82613182067871, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8672381639480591, + "num_tokens": 282846735.0, + "step": 7415 + }, + { + "epoch": 0.943391426027223, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.728147506713867, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8819674849510193, + "num_tokens": 282882242.0, + "step": 7416 + }, + { + "epoch": 0.9435186363058136, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.687049865722656, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8498966693878174, + "num_tokens": 282918774.0, + "step": 7417 + }, + { + "epoch": 0.943645846584404, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.639850616455078, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8548188209533691, + "num_tokens": 282957766.0, + "step": 7418 + }, + { + "epoch": 0.9437730568629945, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.81785774230957, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8515379428863525, + "num_tokens": 282995980.0, + "step": 7419 + }, + { + "epoch": 0.943900267141585, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.597511291503906, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8639249801635742, + "num_tokens": 283033148.0, + "step": 7420 + }, + { + "epoch": 0.9440274774201756, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.683687210083008, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8672837615013123, + "num_tokens": 283069781.0, + "step": 7421 + }, + { + "epoch": 0.9441546876987661, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59408187866211, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.855901837348938, + "num_tokens": 283107270.0, + "step": 7422 + }, + { + "epoch": 0.9442818979773566, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.688814163208008, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8743943572044373, + "num_tokens": 283143571.0, + "step": 7423 + }, + { + "epoch": 0.944409108255947, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.636642456054688, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8676928281784058, + "num_tokens": 283179995.0, + "step": 7424 + }, + { + "epoch": 0.9445363185345376, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.49759292602539, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8622055053710938, + "num_tokens": 283218370.0, + "step": 7425 + }, + { + "epoch": 0.9446635288131281, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.66249656677246, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8567614555358887, + "num_tokens": 283247879.0, + "step": 7426 + }, + { + "epoch": 0.9447907390917186, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.609573364257812, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8777276873588562, + "num_tokens": 283285027.0, + "step": 7427 + }, + { + "epoch": 0.9449179493703092, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.66437530517578, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.865884006023407, + "num_tokens": 283324934.0, + "step": 7428 + }, + { + "epoch": 0.9450451596488997, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.737638473510742, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8714708089828491, + "num_tokens": 283357540.0, + "step": 7429 + }, + { + "epoch": 0.9451723699274901, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.498615264892578, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8602472543716431, + "num_tokens": 283393470.0, + "step": 7430 + }, + { + "epoch": 0.9452995802060806, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75838279724121, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8627191185951233, + "num_tokens": 283429916.0, + "step": 7431 + }, + { + "epoch": 0.9454267904846712, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65827178955078, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8515823483467102, + "num_tokens": 283467997.0, + "step": 7432 + }, + { + "epoch": 0.9455540007632617, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59071922302246, + "learning_rate": 1e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.8287087678909302, + "num_tokens": 283504741.0, + "step": 7433 + }, + { + "epoch": 0.9456812110418522, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.74373435974121, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8721510767936707, + "num_tokens": 283534535.0, + "step": 7434 + }, + { + "epoch": 0.9458084213204427, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.52764129638672, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8744560480117798, + "num_tokens": 283571600.0, + "step": 7435 + }, + { + "epoch": 0.9459356315990332, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.661334991455078, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8486572504043579, + "num_tokens": 283605662.0, + "step": 7436 + }, + { + "epoch": 0.9460628418776237, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.722522735595703, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8626278638839722, + "num_tokens": 283642868.0, + "step": 7437 + }, + { + "epoch": 0.9461900521562142, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.60869026184082, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8536561727523804, + "num_tokens": 283675604.0, + "step": 7438 + }, + { + "epoch": 0.9463172624348047, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.565135955810547, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8837435841560364, + "num_tokens": 283709459.0, + "step": 7439 + }, + { + "epoch": 0.9464444727133953, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.859424591064453, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8619276285171509, + "num_tokens": 283748020.0, + "step": 7440 + }, + { + "epoch": 0.9465716829919858, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.5841121673584, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8575048446655273, + "num_tokens": 283778514.0, + "step": 7441 + }, + { + "epoch": 0.9466988932705762, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.830753326416016, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8666754364967346, + "num_tokens": 283819764.0, + "step": 7442 + }, + { + "epoch": 0.9468261035491667, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.548831939697266, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8478716015815735, + "num_tokens": 283853897.0, + "step": 7443 + }, + { + "epoch": 0.9469533138277573, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.726665496826172, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8761693239212036, + "num_tokens": 283891826.0, + "step": 7444 + }, + { + "epoch": 0.9470805241063478, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75596046447754, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8619460463523865, + "num_tokens": 283931690.0, + "step": 7445 + }, + { + "epoch": 0.9472077343849383, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.485410690307617, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8629584312438965, + "num_tokens": 283971107.0, + "step": 7446 + }, + { + "epoch": 0.9473349446635289, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65520668029785, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8700101375579834, + "num_tokens": 284005286.0, + "step": 7447 + }, + { + "epoch": 0.9474621549421193, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.661861419677734, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8776330351829529, + "num_tokens": 284044089.0, + "step": 7448 + }, + { + "epoch": 0.9475893652207098, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.608036041259766, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8579598665237427, + "num_tokens": 284081172.0, + "step": 7449 + }, + { + "epoch": 0.9477165754993003, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.668424606323242, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8707036375999451, + "num_tokens": 284113362.0, + "step": 7450 + }, + { + "epoch": 0.9478437857778909, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.710573196411133, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8724715709686279, + "num_tokens": 284155272.0, + "step": 7451 + }, + { + "epoch": 0.9479709960564814, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.748218536376953, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8712770938873291, + "num_tokens": 284188218.0, + "step": 7452 + }, + { + "epoch": 0.9480982063350719, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.67108726501465, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8661366701126099, + "num_tokens": 284228838.0, + "step": 7453 + }, + { + "epoch": 0.9482254166136623, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.534175872802734, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8612327575683594, + "num_tokens": 284265570.0, + "step": 7454 + }, + { + "epoch": 0.9483526268922529, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.498579025268555, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8538063764572144, + "num_tokens": 284307062.0, + "step": 7455 + }, + { + "epoch": 0.9484798371708434, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.808198928833008, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.860732913017273, + "num_tokens": 284340488.0, + "step": 7456 + }, + { + "epoch": 0.9486070474494339, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.683406829833984, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8613585233688354, + "num_tokens": 284382399.0, + "step": 7457 + }, + { + "epoch": 0.9487342577280244, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.576589584350586, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8502640724182129, + "num_tokens": 284416391.0, + "step": 7458 + }, + { + "epoch": 0.948861468006615, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.57870101928711, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8636729121208191, + "num_tokens": 284457049.0, + "step": 7459 + }, + { + "epoch": 0.9489886782852054, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.57784080505371, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8612246513366699, + "num_tokens": 284494426.0, + "step": 7460 + }, + { + "epoch": 0.9491158885637959, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.661270141601562, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8807832598686218, + "num_tokens": 284533349.0, + "step": 7461 + }, + { + "epoch": 0.9492430988423864, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.744840621948242, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8694397211074829, + "num_tokens": 284567777.0, + "step": 7462 + }, + { + "epoch": 0.949370309120977, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.63921546936035, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8566002249717712, + "num_tokens": 284609120.0, + "step": 7463 + }, + { + "epoch": 0.9494975193995675, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.751781463623047, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8559228181838989, + "num_tokens": 284649221.0, + "step": 7464 + }, + { + "epoch": 0.949624729678158, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58026695251465, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8676170706748962, + "num_tokens": 284682595.0, + "step": 7465 + }, + { + "epoch": 0.9497519399567486, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.783588409423828, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8590412735939026, + "num_tokens": 284722903.0, + "step": 7466 + }, + { + "epoch": 0.949879150235339, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.631244659423828, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8684619665145874, + "num_tokens": 284759166.0, + "step": 7467 + }, + { + "epoch": 0.9500063605139295, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.757997512817383, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8587186336517334, + "num_tokens": 284803192.0, + "step": 7468 + }, + { + "epoch": 0.95013357079252, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.651338577270508, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8648477792739868, + "num_tokens": 284837088.0, + "step": 7469 + }, + { + "epoch": 0.9502607810711106, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.60093116760254, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8439215421676636, + "num_tokens": 284879641.0, + "step": 7470 + }, + { + "epoch": 0.9503879913497011, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.727550506591797, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8509664535522461, + "num_tokens": 284917519.0, + "step": 7471 + }, + { + "epoch": 0.9505152016282916, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.699865341186523, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.853859543800354, + "num_tokens": 284950422.0, + "step": 7472 + }, + { + "epoch": 0.950642411906882, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.567113876342773, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8694397211074829, + "num_tokens": 284986392.0, + "step": 7473 + }, + { + "epoch": 0.9507696221854726, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.831680297851562, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8645433187484741, + "num_tokens": 285027684.0, + "step": 7474 + }, + { + "epoch": 0.9508968324640631, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.82890510559082, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8539153933525085, + "num_tokens": 285059551.0, + "step": 7475 + }, + { + "epoch": 0.9510240427426536, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.630918502807617, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.863635778427124, + "num_tokens": 285100197.0, + "step": 7476 + }, + { + "epoch": 0.9511512530212441, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.842845916748047, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8630814552307129, + "num_tokens": 285130115.0, + "step": 7477 + }, + { + "epoch": 0.9512784632998347, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.891071319580078, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8640104532241821, + "num_tokens": 285169069.0, + "step": 7478 + }, + { + "epoch": 0.9514056735784251, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.880353927612305, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8426691293716431, + "num_tokens": 285209971.0, + "step": 7479 + }, + { + "epoch": 0.9515328838570156, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.689815521240234, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8574599623680115, + "num_tokens": 285250024.0, + "step": 7480 + }, + { + "epoch": 0.9516600941356061, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65878677368164, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8587327003479004, + "num_tokens": 285284900.0, + "step": 7481 + }, + { + "epoch": 0.9517873044141967, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83394432067871, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8672522306442261, + "num_tokens": 285325657.0, + "step": 7482 + }, + { + "epoch": 0.9519145146927872, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62660789489746, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8629166483879089, + "num_tokens": 285356362.0, + "step": 7483 + }, + { + "epoch": 0.9520417249713777, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70919418334961, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8628791570663452, + "num_tokens": 285394321.0, + "step": 7484 + }, + { + "epoch": 0.9521689352499682, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.782527923583984, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8823216557502747, + "num_tokens": 285435260.0, + "step": 7485 + }, + { + "epoch": 0.9522961455285587, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.590862274169922, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8707475662231445, + "num_tokens": 285479449.0, + "step": 7486 + }, + { + "epoch": 0.9524233558071492, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.809768676757812, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8429819345474243, + "num_tokens": 285520492.0, + "step": 7487 + }, + { + "epoch": 0.9525505660857397, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87860679626465, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8588681817054749, + "num_tokens": 285558410.0, + "step": 7488 + }, + { + "epoch": 0.9526777763643303, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62830352783203, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8432104587554932, + "num_tokens": 285598871.0, + "step": 7489 + }, + { + "epoch": 0.9528049866429208, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.906394958496094, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8550798892974854, + "num_tokens": 285641374.0, + "step": 7490 + }, + { + "epoch": 0.9529321969215112, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59811019897461, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8732908964157104, + "num_tokens": 285675416.0, + "step": 7491 + }, + { + "epoch": 0.9530594072001017, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87847137451172, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8682094812393188, + "num_tokens": 285707491.0, + "step": 7492 + }, + { + "epoch": 0.9531866174786923, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.780418395996094, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8511853814125061, + "num_tokens": 285747571.0, + "step": 7493 + }, + { + "epoch": 0.9533138277572828, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.78502655029297, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8849014043807983, + "num_tokens": 285786362.0, + "step": 7494 + }, + { + "epoch": 0.9534410380358733, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83198356628418, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8697496652603149, + "num_tokens": 285829401.0, + "step": 7495 + }, + { + "epoch": 0.9535682483144639, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.60201072692871, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8608211874961853, + "num_tokens": 285867853.0, + "step": 7496 + }, + { + "epoch": 0.9536954585930543, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.810062408447266, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.873001217842102, + "num_tokens": 285908497.0, + "step": 7497 + }, + { + "epoch": 0.9538226688716448, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.839357376098633, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.849705159664154, + "num_tokens": 285940528.0, + "step": 7498 + }, + { + "epoch": 0.9539498791502353, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.651004791259766, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8641479015350342, + "num_tokens": 285980206.0, + "step": 7499 + }, + { + "epoch": 0.9540770894288259, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.673439025878906, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8572471141815186, + "num_tokens": 286019409.0, + "step": 7500 + }, + { + "epoch": 0.9542042997074164, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.730308532714844, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8713763952255249, + "num_tokens": 286050955.0, + "step": 7501 + }, + { + "epoch": 0.9543315099860069, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.78942108154297, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8678217530250549, + "num_tokens": 286087567.0, + "step": 7502 + }, + { + "epoch": 0.9544587202645973, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.759647369384766, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8420143127441406, + "num_tokens": 286126893.0, + "step": 7503 + }, + { + "epoch": 0.9545859305431879, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.706266403198242, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8572105169296265, + "num_tokens": 286165818.0, + "step": 7504 + }, + { + "epoch": 0.9547131408217784, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.900739669799805, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8642463684082031, + "num_tokens": 286208026.0, + "step": 7505 + }, + { + "epoch": 0.9548403511003689, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.761577606201172, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8629119396209717, + "num_tokens": 286245468.0, + "step": 7506 + }, + { + "epoch": 0.9549675613789594, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.301103591918945, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.866572380065918, + "num_tokens": 286283955.0, + "step": 7507 + }, + { + "epoch": 0.95509477165755, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.54608917236328, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8687807321548462, + "num_tokens": 286326560.0, + "step": 7508 + }, + { + "epoch": 0.9552219819361404, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.861186981201172, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8624742031097412, + "num_tokens": 286365791.0, + "step": 7509 + }, + { + "epoch": 0.9553491922147309, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.93367576599121, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8743248581886292, + "num_tokens": 286404132.0, + "step": 7510 + }, + { + "epoch": 0.9554764024933214, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.737396240234375, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8525298237800598, + "num_tokens": 286444828.0, + "step": 7511 + }, + { + "epoch": 0.955603612771912, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.954275131225586, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8681025505065918, + "num_tokens": 286481725.0, + "step": 7512 + }, + { + "epoch": 0.9557308230505025, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.868518829345703, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8640982508659363, + "num_tokens": 286520354.0, + "step": 7513 + }, + { + "epoch": 0.955858033329093, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65431785583496, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8603932857513428, + "num_tokens": 286564314.0, + "step": 7514 + }, + { + "epoch": 0.9559852436076836, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.954296112060547, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8618859052658081, + "num_tokens": 286606955.0, + "step": 7515 + }, + { + "epoch": 0.956112453886274, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.78502082824707, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.86786949634552, + "num_tokens": 286640091.0, + "step": 7516 + }, + { + "epoch": 0.9562396641648645, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.735780715942383, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8620139360427856, + "num_tokens": 286679090.0, + "step": 7517 + }, + { + "epoch": 0.956366874443455, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90509796142578, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8669650554656982, + "num_tokens": 286720260.0, + "step": 7518 + }, + { + "epoch": 0.9564940847220456, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.698707580566406, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8690052032470703, + "num_tokens": 286763992.0, + "step": 7519 + }, + { + "epoch": 0.9566212950006361, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.842466354370117, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8431454300880432, + "num_tokens": 286807068.0, + "step": 7520 + }, + { + "epoch": 0.9567485052792266, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.746572494506836, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8695222735404968, + "num_tokens": 286853020.0, + "step": 7521 + }, + { + "epoch": 0.956875715557817, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.686859130859375, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8521795272827148, + "num_tokens": 286896098.0, + "step": 7522 + }, + { + "epoch": 0.9570029258364076, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.887876510620117, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8472192287445068, + "num_tokens": 286938873.0, + "step": 7523 + }, + { + "epoch": 0.9571301361149981, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.76913833618164, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8666415214538574, + "num_tokens": 286972941.0, + "step": 7524 + }, + { + "epoch": 0.9572573463935886, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.823911666870117, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8525461554527283, + "num_tokens": 287010818.0, + "step": 7525 + }, + { + "epoch": 0.9573845566721791, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6696834564209, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.867402195930481, + "num_tokens": 287053841.0, + "step": 7526 + }, + { + "epoch": 0.9575117669507697, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.8324031829834, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8688573837280273, + "num_tokens": 287090900.0, + "step": 7527 + }, + { + "epoch": 0.9576389772293601, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.892879486083984, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8613791465759277, + "num_tokens": 287131203.0, + "step": 7528 + }, + { + "epoch": 0.9577661875079506, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7458553314209, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.873380184173584, + "num_tokens": 287165949.0, + "step": 7529 + }, + { + "epoch": 0.9578933977865411, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.821428298950195, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8628565073013306, + "num_tokens": 287203376.0, + "step": 7530 + }, + { + "epoch": 0.9580206080651317, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.80704689025879, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8536904454231262, + "num_tokens": 287240391.0, + "step": 7531 + }, + { + "epoch": 0.9581478183437222, + "ewc_loss": 0.0281982421875, + "ewc_loss_parallel": 2.8252601623535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.624258041381836, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8602232933044434, + "num_tokens": 287278485.0, + "step": 7532 + }, + { + "epoch": 0.9582750286223127, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.909103393554688, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8577026128768921, + "num_tokens": 287313626.0, + "step": 7533 + }, + { + "epoch": 0.9584022389009031, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.77089500427246, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8783907890319824, + "num_tokens": 287344532.0, + "step": 7534 + }, + { + "epoch": 0.9585294491794937, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.808971405029297, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8443593382835388, + "num_tokens": 287383069.0, + "step": 7535 + }, + { + "epoch": 0.9586566594580842, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.817716598510742, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8628314137458801, + "num_tokens": 287418554.0, + "step": 7536 + }, + { + "epoch": 0.9587838697366747, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.514760971069336, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8612579107284546, + "num_tokens": 287452449.0, + "step": 7537 + }, + { + "epoch": 0.9589110800152653, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.77596664428711, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8483194708824158, + "num_tokens": 287492247.0, + "step": 7538 + }, + { + "epoch": 0.9590382902938558, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.732036590576172, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8693726062774658, + "num_tokens": 287528651.0, + "step": 7539 + }, + { + "epoch": 0.9591655005724462, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.68440055847168, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8527306318283081, + "num_tokens": 287565837.0, + "step": 7540 + }, + { + "epoch": 0.9592927108510367, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.676767349243164, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8611375093460083, + "num_tokens": 287599469.0, + "step": 7541 + }, + { + "epoch": 0.9594199211296273, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.770349502563477, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8604822158813477, + "num_tokens": 287637104.0, + "step": 7542 + }, + { + "epoch": 0.9595471314082178, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.787506103515625, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8635352849960327, + "num_tokens": 287679134.0, + "step": 7543 + }, + { + "epoch": 0.9596743416868083, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.73596954345703, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8696601986885071, + "num_tokens": 287720340.0, + "step": 7544 + }, + { + "epoch": 0.9598015519653988, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.779212951660156, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8674288988113403, + "num_tokens": 287760931.0, + "step": 7545 + }, + { + "epoch": 0.9599287622439893, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.85360336303711, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8491878509521484, + "num_tokens": 287801723.0, + "step": 7546 + }, + { + "epoch": 0.9600559725225798, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.942869186401367, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8605067729949951, + "num_tokens": 287837010.0, + "step": 7547 + }, + { + "epoch": 0.9601831828011703, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.569887161254883, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8644952774047852, + "num_tokens": 287874381.0, + "step": 7548 + }, + { + "epoch": 0.9603103930797608, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65039825439453, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.855029821395874, + "num_tokens": 287916673.0, + "step": 7549 + }, + { + "epoch": 0.9604376033583514, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.721792221069336, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8585612773895264, + "num_tokens": 287956764.0, + "step": 7550 + }, + { + "epoch": 0.9605648136369419, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.736589431762695, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8709506988525391, + "num_tokens": 287996373.0, + "step": 7551 + }, + { + "epoch": 0.9606920239155323, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.775920867919922, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8573856949806213, + "num_tokens": 288033671.0, + "step": 7552 + }, + { + "epoch": 0.9608192341941229, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62638282775879, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8710137605667114, + "num_tokens": 288068254.0, + "step": 7553 + }, + { + "epoch": 0.9609464444727134, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.802980422973633, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8482396602630615, + "num_tokens": 288102833.0, + "step": 7554 + }, + { + "epoch": 0.9610736547513039, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6718807220459, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8750139474868774, + "num_tokens": 288135927.0, + "step": 7555 + }, + { + "epoch": 0.9612008650298944, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.722898483276367, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8823609948158264, + "num_tokens": 288173842.0, + "step": 7556 + }, + { + "epoch": 0.961328075308485, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83517074584961, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8681355714797974, + "num_tokens": 288215007.0, + "step": 7557 + }, + { + "epoch": 0.9614552855870754, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.530744552612305, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8523333072662354, + "num_tokens": 288251598.0, + "step": 7558 + }, + { + "epoch": 0.9615824958656659, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.715717315673828, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8643344640731812, + "num_tokens": 288292811.0, + "step": 7559 + }, + { + "epoch": 0.9617097061442564, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.545190811157227, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8678198456764221, + "num_tokens": 288337790.0, + "step": 7560 + }, + { + "epoch": 0.961836916422847, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7579345703125, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8600817918777466, + "num_tokens": 288373347.0, + "step": 7561 + }, + { + "epoch": 0.9619641267014375, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.77130889892578, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8833231925964355, + "num_tokens": 288405650.0, + "step": 7562 + }, + { + "epoch": 0.962091336980028, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.590665817260742, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8682900071144104, + "num_tokens": 288443849.0, + "step": 7563 + }, + { + "epoch": 0.9622185472586186, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.720218658447266, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8415536880493164, + "num_tokens": 288487557.0, + "step": 7564 + }, + { + "epoch": 0.962345757537209, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.762760162353516, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8528276681900024, + "num_tokens": 288529932.0, + "step": 7565 + }, + { + "epoch": 0.9624729678157995, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.712284088134766, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.858182966709137, + "num_tokens": 288568276.0, + "step": 7566 + }, + { + "epoch": 0.96260017809439, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.901588439941406, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8651925325393677, + "num_tokens": 288602387.0, + "step": 7567 + }, + { + "epoch": 0.9627273883729806, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.704700469970703, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8706073760986328, + "num_tokens": 288638157.0, + "step": 7568 + }, + { + "epoch": 0.9628545986515711, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.759878158569336, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8601769208908081, + "num_tokens": 288675054.0, + "step": 7569 + }, + { + "epoch": 0.9629818089301616, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.662551879882812, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8683893084526062, + "num_tokens": 288714222.0, + "step": 7570 + }, + { + "epoch": 0.963109019208752, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.829694747924805, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8528969287872314, + "num_tokens": 288752772.0, + "step": 7571 + }, + { + "epoch": 0.9632362294873426, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.753612518310547, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8465554118156433, + "num_tokens": 288792067.0, + "step": 7572 + }, + { + "epoch": 0.9633634397659331, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.73224449157715, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8744205832481384, + "num_tokens": 288826172.0, + "step": 7573 + }, + { + "epoch": 0.9634906500445236, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.680397033691406, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8736107349395752, + "num_tokens": 288863774.0, + "step": 7574 + }, + { + "epoch": 0.9636178603231141, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65699577331543, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8565588593482971, + "num_tokens": 288901892.0, + "step": 7575 + }, + { + "epoch": 0.9637450706017047, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.836000442504883, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8523407578468323, + "num_tokens": 288941470.0, + "step": 7576 + }, + { + "epoch": 0.9638722808802951, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.654491424560547, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8486899137496948, + "num_tokens": 288988616.0, + "step": 7577 + }, + { + "epoch": 0.9639994911588856, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.778844833374023, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8796536922454834, + "num_tokens": 289024589.0, + "step": 7578 + }, + { + "epoch": 0.9641267014374761, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.711963653564453, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8653014302253723, + "num_tokens": 289064334.0, + "step": 7579 + }, + { + "epoch": 0.9642539117160667, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.757471084594727, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8517589569091797, + "num_tokens": 289099385.0, + "step": 7580 + }, + { + "epoch": 0.9643811219946572, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.81184196472168, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8658041954040527, + "num_tokens": 289139143.0, + "step": 7581 + }, + { + "epoch": 0.9645083322732477, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.81467628479004, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8578394651412964, + "num_tokens": 289179303.0, + "step": 7582 + }, + { + "epoch": 0.9646355425518381, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65758514404297, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8546730279922485, + "num_tokens": 289214218.0, + "step": 7583 + }, + { + "epoch": 0.9647627528304287, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.741554260253906, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8587157726287842, + "num_tokens": 289240991.0, + "step": 7584 + }, + { + "epoch": 0.9648899631090192, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7418270111084, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8565551042556763, + "num_tokens": 289273374.0, + "step": 7585 + }, + { + "epoch": 0.9650171733876097, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.8065242767334, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8489845991134644, + "num_tokens": 289314904.0, + "step": 7586 + }, + { + "epoch": 0.9651443836662003, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.64872169494629, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8555830717086792, + "num_tokens": 289358110.0, + "step": 7587 + }, + { + "epoch": 0.9652715939447908, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.738325119018555, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8528144359588623, + "num_tokens": 289391581.0, + "step": 7588 + }, + { + "epoch": 0.9653988042233812, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.616336822509766, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.836311936378479, + "num_tokens": 289432280.0, + "step": 7589 + }, + { + "epoch": 0.9655260145019717, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.647838592529297, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8467313647270203, + "num_tokens": 289473222.0, + "step": 7590 + }, + { + "epoch": 0.9656532247805623, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.856740951538086, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8587239384651184, + "num_tokens": 289512898.0, + "step": 7591 + }, + { + "epoch": 0.9657804350591528, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.63688850402832, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8674684762954712, + "num_tokens": 289554394.0, + "step": 7592 + }, + { + "epoch": 0.9659076453377433, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6024227142334, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8592192530632019, + "num_tokens": 289594871.0, + "step": 7593 + }, + { + "epoch": 0.9660348556163338, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.721393585205078, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8505429029464722, + "num_tokens": 289629165.0, + "step": 7594 + }, + { + "epoch": 0.9661620658949243, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.723127365112305, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8611533641815186, + "num_tokens": 289670131.0, + "step": 7595 + }, + { + "epoch": 0.9662892761735148, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.584129333496094, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.864645779132843, + "num_tokens": 289705238.0, + "step": 7596 + }, + { + "epoch": 0.9664164864521053, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.692676544189453, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8650499582290649, + "num_tokens": 289740146.0, + "step": 7597 + }, + { + "epoch": 0.9665436967306958, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.610794067382812, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.865949273109436, + "num_tokens": 289781145.0, + "step": 7598 + }, + { + "epoch": 0.9666709070092864, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.66379165649414, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8451576232910156, + "num_tokens": 289816347.0, + "step": 7599 + }, + { + "epoch": 0.9667981172878769, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.823213577270508, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8474299311637878, + "num_tokens": 289852974.0, + "step": 7600 + }, + { + "epoch": 0.9669253275664673, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70242691040039, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8502150774002075, + "num_tokens": 289894553.0, + "step": 7601 + }, + { + "epoch": 0.9670525378450578, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.726058959960938, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8440154790878296, + "num_tokens": 289930641.0, + "step": 7602 + }, + { + "epoch": 0.9671797481236484, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.835119247436523, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8559973835945129, + "num_tokens": 289972690.0, + "step": 7603 + }, + { + "epoch": 0.9673069584022389, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.727399826049805, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.855597734451294, + "num_tokens": 290006518.0, + "step": 7604 + }, + { + "epoch": 0.9674341686808294, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.73162078857422, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8481601476669312, + "num_tokens": 290045206.0, + "step": 7605 + }, + { + "epoch": 0.96756137895942, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.557998657226562, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8668528199195862, + "num_tokens": 290086976.0, + "step": 7606 + }, + { + "epoch": 0.9676885892380104, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.750856399536133, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8745863437652588, + "num_tokens": 290126894.0, + "step": 7607 + }, + { + "epoch": 0.9678157995166009, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.779813766479492, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8752278089523315, + "num_tokens": 290161684.0, + "step": 7608 + }, + { + "epoch": 0.9679430097951914, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.73622703552246, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8636627197265625, + "num_tokens": 290203407.0, + "step": 7609 + }, + { + "epoch": 0.968070220073782, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.876832962036133, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8684746623039246, + "num_tokens": 290238645.0, + "step": 7610 + }, + { + "epoch": 0.9681974303523725, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.715639114379883, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8612006306648254, + "num_tokens": 290280279.0, + "step": 7611 + }, + { + "epoch": 0.968324640630963, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.833330154418945, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8602815270423889, + "num_tokens": 290317739.0, + "step": 7612 + }, + { + "epoch": 0.9684518509095535, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.794679641723633, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8594747185707092, + "num_tokens": 290360384.0, + "step": 7613 + }, + { + "epoch": 0.968579061188144, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.611703872680664, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8683784604072571, + "num_tokens": 290398706.0, + "step": 7614 + }, + { + "epoch": 0.9687062714667345, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87520980834961, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8754262328147888, + "num_tokens": 290439524.0, + "step": 7615 + }, + { + "epoch": 0.968833481745325, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.666906356811523, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8562842607498169, + "num_tokens": 290472125.0, + "step": 7616 + }, + { + "epoch": 0.9689606920239155, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.739458084106445, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8461549282073975, + "num_tokens": 290514348.0, + "step": 7617 + }, + { + "epoch": 0.9690879023025061, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.860822677612305, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8560066223144531, + "num_tokens": 290551699.0, + "step": 7618 + }, + { + "epoch": 0.9692151125810966, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70948600769043, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8637911081314087, + "num_tokens": 290588644.0, + "step": 7619 + }, + { + "epoch": 0.969342322859687, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.774368286132812, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8636184334754944, + "num_tokens": 290629546.0, + "step": 7620 + }, + { + "epoch": 0.9694695331382776, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.685462951660156, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8700059652328491, + "num_tokens": 290665527.0, + "step": 7621 + }, + { + "epoch": 0.9695967434168681, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.60361671447754, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8533014059066772, + "num_tokens": 290706667.0, + "step": 7622 + }, + { + "epoch": 0.9697239536954586, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.801097869873047, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.856590986251831, + "num_tokens": 290745698.0, + "step": 7623 + }, + { + "epoch": 0.9698511639740491, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.674720764160156, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8672388195991516, + "num_tokens": 290777983.0, + "step": 7624 + }, + { + "epoch": 0.9699783742526397, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.943147659301758, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8566889762878418, + "num_tokens": 290814184.0, + "step": 7625 + }, + { + "epoch": 0.9701055845312301, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.558582305908203, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8531467914581299, + "num_tokens": 290854054.0, + "step": 7626 + }, + { + "epoch": 0.9702327948098206, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.872711181640625, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8612244725227356, + "num_tokens": 290891325.0, + "step": 7627 + }, + { + "epoch": 0.9703600050884111, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.623201370239258, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8600074052810669, + "num_tokens": 290930101.0, + "step": 7628 + }, + { + "epoch": 0.9704872153670017, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.685483932495117, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8627687096595764, + "num_tokens": 290964691.0, + "step": 7629 + }, + { + "epoch": 0.9706144256455922, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62837028503418, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8586686849594116, + "num_tokens": 291008140.0, + "step": 7630 + }, + { + "epoch": 0.9707416359241827, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.68202781677246, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8558667302131653, + "num_tokens": 291041484.0, + "step": 7631 + }, + { + "epoch": 0.9708688462027731, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.58028221130371, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8435306549072266, + "num_tokens": 291082337.0, + "step": 7632 + }, + { + "epoch": 0.9709960564813637, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.67123794555664, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8615246415138245, + "num_tokens": 291115921.0, + "step": 7633 + }, + { + "epoch": 0.9711232667599542, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.778345108032227, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8695055246353149, + "num_tokens": 291154367.0, + "step": 7634 + }, + { + "epoch": 0.9712504770385447, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.886587142944336, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8618948459625244, + "num_tokens": 291198919.0, + "step": 7635 + }, + { + "epoch": 0.9713776873171353, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.723182678222656, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8497377634048462, + "num_tokens": 291237062.0, + "step": 7636 + }, + { + "epoch": 0.9715048975957258, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.658180236816406, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8730721473693848, + "num_tokens": 291275676.0, + "step": 7637 + }, + { + "epoch": 0.9716321078743162, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.689416885375977, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8543413281440735, + "num_tokens": 291316785.0, + "step": 7638 + }, + { + "epoch": 0.9717593181529067, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.767114639282227, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8591991662979126, + "num_tokens": 291360448.0, + "step": 7639 + }, + { + "epoch": 0.9718865284314973, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.705978393554688, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8584303855895996, + "num_tokens": 291401650.0, + "step": 7640 + }, + { + "epoch": 0.9720137387100878, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.64544677734375, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8626375198364258, + "num_tokens": 291439736.0, + "step": 7641 + }, + { + "epoch": 0.9721409489886783, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.88807487487793, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8667329549789429, + "num_tokens": 291479077.0, + "step": 7642 + }, + { + "epoch": 0.9722681592672688, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.65583038330078, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8716017007827759, + "num_tokens": 291512470.0, + "step": 7643 + }, + { + "epoch": 0.9723953695458593, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.986738204956055, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8728759288787842, + "num_tokens": 291552797.0, + "step": 7644 + }, + { + "epoch": 0.9725225798244498, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.674619674682617, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8661706447601318, + "num_tokens": 291589885.0, + "step": 7645 + }, + { + "epoch": 0.9726497901030403, + "ewc_loss": 0.0283203125, + "ewc_loss_parallel": 2.8371810913085938e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.683286666870117, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8610158562660217, + "num_tokens": 291633005.0, + "step": 7646 + }, + { + "epoch": 0.9727770003816308, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.742036819458008, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.858758270740509, + "num_tokens": 291666083.0, + "step": 7647 + }, + { + "epoch": 0.9729042106602214, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.693592071533203, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8684956431388855, + "num_tokens": 291706421.0, + "step": 7648 + }, + { + "epoch": 0.9730314209388119, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.937456130981445, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8668934106826782, + "num_tokens": 291744641.0, + "step": 7649 + }, + { + "epoch": 0.9731586312174023, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.896865844726562, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8873707056045532, + "num_tokens": 291780586.0, + "step": 7650 + }, + { + "epoch": 0.9732858414959928, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.67828369140625, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8573447465896606, + "num_tokens": 291820482.0, + "step": 7651 + }, + { + "epoch": 0.9734130517745834, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.915855407714844, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.870307207107544, + "num_tokens": 291858568.0, + "step": 7652 + }, + { + "epoch": 0.9735402620531739, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75934410095215, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8540117740631104, + "num_tokens": 291889383.0, + "step": 7653 + }, + { + "epoch": 0.9736674723317644, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.82208824157715, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8713833689689636, + "num_tokens": 291927541.0, + "step": 7654 + }, + { + "epoch": 0.973794682610355, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.862964630126953, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8461987376213074, + "num_tokens": 291969074.0, + "step": 7655 + }, + { + "epoch": 0.9739218928889454, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.77451515197754, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8604096174240112, + "num_tokens": 292008311.0, + "step": 7656 + }, + { + "epoch": 0.9740491031675359, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.883359909057617, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8533986210823059, + "num_tokens": 292051810.0, + "step": 7657 + }, + { + "epoch": 0.9741763134461264, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.953718185424805, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8681010007858276, + "num_tokens": 292090411.0, + "step": 7658 + }, + { + "epoch": 0.974303523724717, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.700971603393555, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8539389371871948, + "num_tokens": 292124411.0, + "step": 7659 + }, + { + "epoch": 0.9744307340033075, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.0308780670166, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8612766861915588, + "num_tokens": 292166428.0, + "step": 7660 + }, + { + "epoch": 0.974557944281898, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83597755432129, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8636983633041382, + "num_tokens": 292202687.0, + "step": 7661 + }, + { + "epoch": 0.9746851545604885, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.77419090270996, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.870637059211731, + "num_tokens": 292246364.0, + "step": 7662 + }, + { + "epoch": 0.974812364839079, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.714540481567383, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8624696731567383, + "num_tokens": 292285789.0, + "step": 7663 + }, + { + "epoch": 0.9749395751176695, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.769412994384766, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8564736843109131, + "num_tokens": 292320755.0, + "step": 7664 + }, + { + "epoch": 0.97506678539626, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.813953399658203, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8714130520820618, + "num_tokens": 292362818.0, + "step": 7665 + }, + { + "epoch": 0.9751939956748505, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.936235427856445, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8541501760482788, + "num_tokens": 292404016.0, + "step": 7666 + }, + { + "epoch": 0.9753212059534411, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.830581665039062, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8478195071220398, + "num_tokens": 292442117.0, + "step": 7667 + }, + { + "epoch": 0.9754484162320316, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.803369522094727, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8696685433387756, + "num_tokens": 292472523.0, + "step": 7668 + }, + { + "epoch": 0.975575626510622, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.861814498901367, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8663859963417053, + "num_tokens": 292508571.0, + "step": 7669 + }, + { + "epoch": 0.9757028367892125, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.819652557373047, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8585321307182312, + "num_tokens": 292545358.0, + "step": 7670 + }, + { + "epoch": 0.9758300470678031, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.9637508392334, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8472446203231812, + "num_tokens": 292583962.0, + "step": 7671 + }, + { + "epoch": 0.9759572573463936, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.733036041259766, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.861420750617981, + "num_tokens": 292616186.0, + "step": 7672 + }, + { + "epoch": 0.9760844676249841, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.89564323425293, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8576983213424683, + "num_tokens": 292657378.0, + "step": 7673 + }, + { + "epoch": 0.9762116779035747, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.863618850708008, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8758401870727539, + "num_tokens": 292692041.0, + "step": 7674 + }, + { + "epoch": 0.9763388881821651, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.856761932373047, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8593837022781372, + "num_tokens": 292725062.0, + "step": 7675 + }, + { + "epoch": 0.9764660984607556, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.652009963989258, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8692578077316284, + "num_tokens": 292766522.0, + "step": 7676 + }, + { + "epoch": 0.9765933087393461, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98255729675293, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8612792491912842, + "num_tokens": 292811371.0, + "step": 7677 + }, + { + "epoch": 0.9767205190179367, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.753725051879883, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.861810564994812, + "num_tokens": 292850237.0, + "step": 7678 + }, + { + "epoch": 0.9768477292965272, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.871768951416016, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8653873801231384, + "num_tokens": 292887291.0, + "step": 7679 + }, + { + "epoch": 0.9769749395751177, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.86178207397461, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8748726844787598, + "num_tokens": 292928396.0, + "step": 7680 + }, + { + "epoch": 0.9771021498537081, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70088768005371, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8651818633079529, + "num_tokens": 292962748.0, + "step": 7681 + }, + { + "epoch": 0.9772293601322987, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.831907272338867, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8733029365539551, + "num_tokens": 293002330.0, + "step": 7682 + }, + { + "epoch": 0.9773565704108892, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75091552734375, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8618316650390625, + "num_tokens": 293038288.0, + "step": 7683 + }, + { + "epoch": 0.9774837806894797, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75751304626465, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8642660975456238, + "num_tokens": 293078121.0, + "step": 7684 + }, + { + "epoch": 0.9776109909680702, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.626201629638672, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8587256073951721, + "num_tokens": 293116846.0, + "step": 7685 + }, + { + "epoch": 0.9777382012466608, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.812702178955078, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8580453395843506, + "num_tokens": 293162237.0, + "step": 7686 + }, + { + "epoch": 0.9778654115252512, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.6522216796875, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8589063882827759, + "num_tokens": 293196631.0, + "step": 7687 + }, + { + "epoch": 0.9779926218038417, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.84913444519043, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8722082376480103, + "num_tokens": 293238883.0, + "step": 7688 + }, + { + "epoch": 0.9781198320824323, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.589515686035156, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.851835310459137, + "num_tokens": 293270802.0, + "step": 7689 + }, + { + "epoch": 0.9782470423610228, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.714975357055664, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8485549092292786, + "num_tokens": 293314910.0, + "step": 7690 + }, + { + "epoch": 0.9783742526396133, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.653146743774414, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8749436140060425, + "num_tokens": 293353083.0, + "step": 7691 + }, + { + "epoch": 0.9785014629182038, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.659629821777344, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8643589615821838, + "num_tokens": 293389473.0, + "step": 7692 + }, + { + "epoch": 0.9786286731967943, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.73461151123047, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8756843209266663, + "num_tokens": 293426893.0, + "step": 7693 + }, + { + "epoch": 0.9787558834753848, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.975147247314453, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8672133684158325, + "num_tokens": 293468116.0, + "step": 7694 + }, + { + "epoch": 0.9788830937539753, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.689037322998047, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8646739721298218, + "num_tokens": 293503050.0, + "step": 7695 + }, + { + "epoch": 0.9790103040325658, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.775222778320312, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8520026206970215, + "num_tokens": 293541673.0, + "step": 7696 + }, + { + "epoch": 0.9791375143111564, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83893394470215, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8591455817222595, + "num_tokens": 293578472.0, + "step": 7697 + }, + { + "epoch": 0.9792647245897469, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.841503143310547, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8640116453170776, + "num_tokens": 293611204.0, + "step": 7698 + }, + { + "epoch": 0.9793919348683373, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.843955993652344, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8523762226104736, + "num_tokens": 293651352.0, + "step": 7699 + }, + { + "epoch": 0.9795191451469278, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.865352630615234, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8733790516853333, + "num_tokens": 293687396.0, + "step": 7700 + }, + { + "epoch": 0.9796463554255184, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.748994827270508, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8579542636871338, + "num_tokens": 293725083.0, + "step": 7701 + }, + { + "epoch": 0.9797735657041089, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.905200958251953, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8631137609481812, + "num_tokens": 293766136.0, + "step": 7702 + }, + { + "epoch": 0.9799007759826994, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98571014404297, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8459270000457764, + "num_tokens": 293805810.0, + "step": 7703 + }, + { + "epoch": 0.98002798626129, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.69748878479004, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8537428379058838, + "num_tokens": 293839041.0, + "step": 7704 + }, + { + "epoch": 0.9801551965398804, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90064811706543, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.876980185508728, + "num_tokens": 293876541.0, + "step": 7705 + }, + { + "epoch": 0.9802824068184709, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.737199783325195, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8761177062988281, + "num_tokens": 293918141.0, + "step": 7706 + }, + { + "epoch": 0.9804096170970614, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.767175674438477, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8513197302818298, + "num_tokens": 293958078.0, + "step": 7707 + }, + { + "epoch": 0.980536827375652, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.72515106201172, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8721779584884644, + "num_tokens": 293990347.0, + "step": 7708 + }, + { + "epoch": 0.9806640376542425, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.027864456176758, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8679075241088867, + "num_tokens": 294032136.0, + "step": 7709 + }, + { + "epoch": 0.980791247932833, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.869897842407227, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8508390784263611, + "num_tokens": 294065136.0, + "step": 7710 + }, + { + "epoch": 0.9809184582114235, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.59042739868164, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8762919902801514, + "num_tokens": 294104539.0, + "step": 7711 + }, + { + "epoch": 0.981045668490014, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.728670120239258, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8596322536468506, + "num_tokens": 294148962.0, + "step": 7712 + }, + { + "epoch": 0.9811728787686045, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.81414794921875, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8656803369522095, + "num_tokens": 294181866.0, + "step": 7713 + }, + { + "epoch": 0.981300089047195, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.78509521484375, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8710338473320007, + "num_tokens": 294223558.0, + "step": 7714 + }, + { + "epoch": 0.9814272993257855, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75055694580078, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8669407367706299, + "num_tokens": 294260487.0, + "step": 7715 + }, + { + "epoch": 0.9815545096043761, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.721691131591797, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8633776307106018, + "num_tokens": 294302642.0, + "step": 7716 + }, + { + "epoch": 0.9816817198829666, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.571224212646484, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8548527956008911, + "num_tokens": 294338376.0, + "step": 7717 + }, + { + "epoch": 0.981808930161557, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.006927490234375, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8574374318122864, + "num_tokens": 294372089.0, + "step": 7718 + }, + { + "epoch": 0.9819361404401475, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.55718231201172, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8664379119873047, + "num_tokens": 294403801.0, + "step": 7719 + }, + { + "epoch": 0.9820633507187381, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.831951141357422, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8651888370513916, + "num_tokens": 294440301.0, + "step": 7720 + }, + { + "epoch": 0.9821905609973286, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.77149200439453, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8625088930130005, + "num_tokens": 294472967.0, + "step": 7721 + }, + { + "epoch": 0.9823177712759191, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.877002716064453, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8533438444137573, + "num_tokens": 294518248.0, + "step": 7722 + }, + { + "epoch": 0.9824449815545097, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.62677574157715, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8602732419967651, + "num_tokens": 294553899.0, + "step": 7723 + }, + { + "epoch": 0.9825721918331001, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.850683212280273, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.840449333190918, + "num_tokens": 294591382.0, + "step": 7724 + }, + { + "epoch": 0.9826994021116906, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.763174057006836, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8631584048271179, + "num_tokens": 294629606.0, + "step": 7725 + }, + { + "epoch": 0.9828266123902811, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.604305267333984, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8648132085800171, + "num_tokens": 294669417.0, + "step": 7726 + }, + { + "epoch": 0.9829538226688717, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.91555404663086, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.871683657169342, + "num_tokens": 294709876.0, + "step": 7727 + }, + { + "epoch": 0.9830810329474622, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.800334930419922, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8528203964233398, + "num_tokens": 294750565.0, + "step": 7728 + }, + { + "epoch": 0.9832082432260527, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.75298309326172, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8630317449569702, + "num_tokens": 294790586.0, + "step": 7729 + }, + { + "epoch": 0.9833354535046431, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.818693161010742, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8602645397186279, + "num_tokens": 294830176.0, + "step": 7730 + }, + { + "epoch": 0.9834626637832337, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.699804306030273, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8497883081436157, + "num_tokens": 294874199.0, + "step": 7731 + }, + { + "epoch": 0.9835898740618242, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.758419036865234, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8692338466644287, + "num_tokens": 294914219.0, + "step": 7732 + }, + { + "epoch": 0.9837170843404147, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.694955825805664, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8396489024162292, + "num_tokens": 294950726.0, + "step": 7733 + }, + { + "epoch": 0.9838442946190052, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.79119300842285, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.861802339553833, + "num_tokens": 294993815.0, + "step": 7734 + }, + { + "epoch": 0.9839715048975958, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.826204299926758, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8698851466178894, + "num_tokens": 295029275.0, + "step": 7735 + }, + { + "epoch": 0.9840987151761862, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.710128784179688, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8619304895401001, + "num_tokens": 295066166.0, + "step": 7736 + }, + { + "epoch": 0.9842259254547767, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.786827087402344, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8401854038238525, + "num_tokens": 295100436.0, + "step": 7737 + }, + { + "epoch": 0.9843531357333672, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.773574829101562, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8639593720436096, + "num_tokens": 295141756.0, + "step": 7738 + }, + { + "epoch": 0.9844803460119578, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.893169403076172, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8800410032272339, + "num_tokens": 295178894.0, + "step": 7739 + }, + { + "epoch": 0.9846075562905483, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.841121673583984, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8520811796188354, + "num_tokens": 295219133.0, + "step": 7740 + }, + { + "epoch": 0.9847347665691388, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.790447235107422, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8636445999145508, + "num_tokens": 295260322.0, + "step": 7741 + }, + { + "epoch": 0.9848619768477292, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.86247444152832, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8567601442337036, + "num_tokens": 295299675.0, + "step": 7742 + }, + { + "epoch": 0.9849891871263198, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.734739303588867, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8678675889968872, + "num_tokens": 295336281.0, + "step": 7743 + }, + { + "epoch": 0.9851163974049103, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.804853439331055, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.86045241355896, + "num_tokens": 295376932.0, + "step": 7744 + }, + { + "epoch": 0.9852436076835008, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.800880432128906, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8592058420181274, + "num_tokens": 295421904.0, + "step": 7745 + }, + { + "epoch": 0.9853708179620914, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.690868377685547, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8650776147842407, + "num_tokens": 295456688.0, + "step": 7746 + }, + { + "epoch": 0.9854980282406819, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.8297061920166, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8795908689498901, + "num_tokens": 295500022.0, + "step": 7747 + }, + { + "epoch": 0.9856252385192723, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70439338684082, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8820368647575378, + "num_tokens": 295538106.0, + "step": 7748 + }, + { + "epoch": 0.9857524487978628, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.99543571472168, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8573750257492065, + "num_tokens": 295572465.0, + "step": 7749 + }, + { + "epoch": 0.9858796590764534, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.643810272216797, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8678245544433594, + "num_tokens": 295614577.0, + "step": 7750 + }, + { + "epoch": 0.9860068693550439, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.823781967163086, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8662846088409424, + "num_tokens": 295654780.0, + "step": 7751 + }, + { + "epoch": 0.9861340796336344, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.718175888061523, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8608673810958862, + "num_tokens": 295695880.0, + "step": 7752 + }, + { + "epoch": 0.986261289912225, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.762765884399414, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8624507188796997, + "num_tokens": 295738176.0, + "step": 7753 + }, + { + "epoch": 0.9863885001908154, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.857666015625, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8524801135063171, + "num_tokens": 295777562.0, + "step": 7754 + }, + { + "epoch": 0.9865157104694059, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.69010353088379, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8788295984268188, + "num_tokens": 295815013.0, + "step": 7755 + }, + { + "epoch": 0.9866429207479964, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.773902893066406, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8512558937072754, + "num_tokens": 295857751.0, + "step": 7756 + }, + { + "epoch": 0.986770131026587, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.982982635498047, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8675814270973206, + "num_tokens": 295896033.0, + "step": 7757 + }, + { + "epoch": 0.9868973413051775, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.714981079101562, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8642081618309021, + "num_tokens": 295936838.0, + "step": 7758 + }, + { + "epoch": 0.987024551583768, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.862489700317383, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8680347204208374, + "num_tokens": 295972307.0, + "step": 7759 + }, + { + "epoch": 0.9871517618623584, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.84501838684082, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8607226014137268, + "num_tokens": 296009966.0, + "step": 7760 + }, + { + "epoch": 0.987278972140949, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.84748649597168, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8590649366378784, + "num_tokens": 296047874.0, + "step": 7761 + }, + { + "epoch": 0.9874061824195395, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.928794860839844, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8701016902923584, + "num_tokens": 296090591.0, + "step": 7762 + }, + { + "epoch": 0.98753339269813, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.624996185302734, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8564437031745911, + "num_tokens": 296133024.0, + "step": 7763 + }, + { + "epoch": 0.9876606029767205, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.871784210205078, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8514772653579712, + "num_tokens": 296177347.0, + "step": 7764 + }, + { + "epoch": 0.9877878132553111, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.677227020263672, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.853277325630188, + "num_tokens": 296215532.0, + "step": 7765 + }, + { + "epoch": 0.9879150235339016, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.848125457763672, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8511375784873962, + "num_tokens": 296249494.0, + "step": 7766 + }, + { + "epoch": 0.988042233812492, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.668230056762695, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8538371920585632, + "num_tokens": 296289370.0, + "step": 7767 + }, + { + "epoch": 0.9881694440910825, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.865808486938477, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.863906741142273, + "num_tokens": 296329376.0, + "step": 7768 + }, + { + "epoch": 0.9882966543696731, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83984375, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8688225746154785, + "num_tokens": 296368875.0, + "step": 7769 + }, + { + "epoch": 0.9884238646482636, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.795665740966797, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8619248867034912, + "num_tokens": 296404766.0, + "step": 7770 + }, + { + "epoch": 0.9885510749268541, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.66946792602539, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8617690801620483, + "num_tokens": 296441705.0, + "step": 7771 + }, + { + "epoch": 0.9886782852054447, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.952478408813477, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8696231842041016, + "num_tokens": 296481075.0, + "step": 7772 + }, + { + "epoch": 0.9888054954840351, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.756765365600586, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8586368560791016, + "num_tokens": 296511404.0, + "step": 7773 + }, + { + "epoch": 0.9889327057626256, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90913963317871, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8536744117736816, + "num_tokens": 296546080.0, + "step": 7774 + }, + { + "epoch": 0.9890599160412161, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.650917053222656, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.865013837814331, + "num_tokens": 296586119.0, + "step": 7775 + }, + { + "epoch": 0.9891871263198067, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.720590591430664, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.860664427280426, + "num_tokens": 296629602.0, + "step": 7776 + }, + { + "epoch": 0.9893143365983972, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.753259658813477, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8553932905197144, + "num_tokens": 296666709.0, + "step": 7777 + }, + { + "epoch": 0.9894415468769877, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.783782958984375, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8624567985534668, + "num_tokens": 296702611.0, + "step": 7778 + }, + { + "epoch": 0.9895687571555781, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.978961944580078, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8669977188110352, + "num_tokens": 296737649.0, + "step": 7779 + }, + { + "epoch": 0.9896959674341687, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.753860473632812, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.864136278629303, + "num_tokens": 296776915.0, + "step": 7780 + }, + { + "epoch": 0.9898231777127592, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.89414405822754, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8681771755218506, + "num_tokens": 296811491.0, + "step": 7781 + }, + { + "epoch": 0.9899503879913497, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.82076644897461, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8666400909423828, + "num_tokens": 296847602.0, + "step": 7782 + }, + { + "epoch": 0.9900775982699402, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.771528244018555, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8459851741790771, + "num_tokens": 296884737.0, + "step": 7783 + }, + { + "epoch": 0.9902048085485308, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.800899505615234, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.849466860294342, + "num_tokens": 296927780.0, + "step": 7784 + }, + { + "epoch": 0.9903320188271212, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90123176574707, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8763665556907654, + "num_tokens": 296964200.0, + "step": 7785 + }, + { + "epoch": 0.9904592291057117, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.713180541992188, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8538476228713989, + "num_tokens": 297009927.0, + "step": 7786 + }, + { + "epoch": 0.9905864393843022, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.903900146484375, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.84920334815979, + "num_tokens": 297050788.0, + "step": 7787 + }, + { + "epoch": 0.9907136496628928, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.691110610961914, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.868659496307373, + "num_tokens": 297092398.0, + "step": 7788 + }, + { + "epoch": 0.9908408599414833, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.846010208129883, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8595181703567505, + "num_tokens": 297130728.0, + "step": 7789 + }, + { + "epoch": 0.9909680702200738, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.862947463989258, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8852978348731995, + "num_tokens": 297169124.0, + "step": 7790 + }, + { + "epoch": 0.9910952804986642, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.890832901000977, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8715351819992065, + "num_tokens": 297204914.0, + "step": 7791 + }, + { + "epoch": 0.9912224907772548, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.788854598999023, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8596784472465515, + "num_tokens": 297240551.0, + "step": 7792 + }, + { + "epoch": 0.9913497010558453, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.86806297302246, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8530554175376892, + "num_tokens": 297287160.0, + "step": 7793 + }, + { + "epoch": 0.9914769113344358, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.72969627380371, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8642499446868896, + "num_tokens": 297323394.0, + "step": 7794 + }, + { + "epoch": 0.9916041216130264, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.054786682128906, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8411357998847961, + "num_tokens": 297357808.0, + "step": 7795 + }, + { + "epoch": 0.9917313318916169, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.745216369628906, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8571336269378662, + "num_tokens": 297397300.0, + "step": 7796 + }, + { + "epoch": 0.9918585421702073, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.844139099121094, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8710784316062927, + "num_tokens": 297434699.0, + "step": 7797 + }, + { + "epoch": 0.9919857524487978, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.928321838378906, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8621673583984375, + "num_tokens": 297473939.0, + "step": 7798 + }, + { + "epoch": 0.9921129627273884, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87854766845703, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8545025587081909, + "num_tokens": 297511154.0, + "step": 7799 + }, + { + "epoch": 0.9922401730059789, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.853975296020508, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8534499406814575, + "num_tokens": 297556092.0, + "step": 7800 + }, + { + "epoch": 0.9923673832845694, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.850051879882812, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8659400343894958, + "num_tokens": 297598160.0, + "step": 7801 + }, + { + "epoch": 0.9924945935631599, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83420181274414, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8782199621200562, + "num_tokens": 297638706.0, + "step": 7802 + }, + { + "epoch": 0.9926218038417504, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.651775360107422, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.866427481174469, + "num_tokens": 297675955.0, + "step": 7803 + }, + { + "epoch": 0.9927490141203409, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.073373794555664, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8579782247543335, + "num_tokens": 297715362.0, + "step": 7804 + }, + { + "epoch": 0.9928762243989314, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.849157333374023, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8479979038238525, + "num_tokens": 297750931.0, + "step": 7805 + }, + { + "epoch": 0.993003434677522, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.92451286315918, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8548343181610107, + "num_tokens": 297786107.0, + "step": 7806 + }, + { + "epoch": 0.9931306449561125, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.86309814453125, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8620994091033936, + "num_tokens": 297824782.0, + "step": 7807 + }, + { + "epoch": 0.993257855234703, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.77113151550293, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8581511974334717, + "num_tokens": 297861748.0, + "step": 7808 + }, + { + "epoch": 0.9933850655132934, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.866670608520508, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8617677688598633, + "num_tokens": 297902722.0, + "step": 7809 + }, + { + "epoch": 0.993512275791884, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.88120460510254, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8632243871688843, + "num_tokens": 297948709.0, + "step": 7810 + }, + { + "epoch": 0.9936394860704745, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.812862396240234, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8639453053474426, + "num_tokens": 297986446.0, + "step": 7811 + }, + { + "epoch": 0.993766696349065, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.963890075683594, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8542606234550476, + "num_tokens": 298024004.0, + "step": 7812 + }, + { + "epoch": 0.9938939066276555, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.944400787353516, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.852690577507019, + "num_tokens": 298065518.0, + "step": 7813 + }, + { + "epoch": 0.9940211169062461, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.030086517333984, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8551446795463562, + "num_tokens": 298108138.0, + "step": 7814 + }, + { + "epoch": 0.9941483271848366, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.812583923339844, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8616819381713867, + "num_tokens": 298149348.0, + "step": 7815 + }, + { + "epoch": 0.994275537463427, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.019689559936523, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.859961748123169, + "num_tokens": 298186347.0, + "step": 7816 + }, + { + "epoch": 0.9944027477420175, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.724863052368164, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8411362171173096, + "num_tokens": 298223322.0, + "step": 7817 + }, + { + "epoch": 0.9945299580206081, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.925819396972656, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8570247888565063, + "num_tokens": 298258794.0, + "step": 7818 + }, + { + "epoch": 0.9946571682991986, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.948158264160156, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8564243912696838, + "num_tokens": 298297510.0, + "step": 7819 + }, + { + "epoch": 0.9947843785777891, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.054317474365234, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8660376071929932, + "num_tokens": 298333784.0, + "step": 7820 + }, + { + "epoch": 0.9949115888563796, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.845666885375977, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8676626086235046, + "num_tokens": 298371241.0, + "step": 7821 + }, + { + "epoch": 0.9950387991349701, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.88420867919922, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.848625898361206, + "num_tokens": 298407902.0, + "step": 7822 + }, + { + "epoch": 0.9951660094135606, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.944732666015625, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8368089199066162, + "num_tokens": 298452339.0, + "step": 7823 + }, + { + "epoch": 0.9952932196921511, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.728534698486328, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.852256178855896, + "num_tokens": 298495854.0, + "step": 7824 + }, + { + "epoch": 0.9954204299707416, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.798311233520508, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8557660579681396, + "num_tokens": 298529642.0, + "step": 7825 + }, + { + "epoch": 0.9955476402493322, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.795841217041016, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8637507557868958, + "num_tokens": 298574396.0, + "step": 7826 + }, + { + "epoch": 0.9956748505279227, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.9605712890625, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8480769395828247, + "num_tokens": 298612457.0, + "step": 7827 + }, + { + "epoch": 0.9958020608065131, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.78336524963379, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8611466884613037, + "num_tokens": 298645061.0, + "step": 7828 + }, + { + "epoch": 0.9959292710851037, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7364559173584, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8601914644241333, + "num_tokens": 298688014.0, + "step": 7829 + }, + { + "epoch": 0.9960564813636942, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.827604293823242, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8645956516265869, + "num_tokens": 298727815.0, + "step": 7830 + }, + { + "epoch": 0.9961836916422847, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.93389892578125, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8818868398666382, + "num_tokens": 298765663.0, + "step": 7831 + }, + { + "epoch": 0.9963109019208752, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.756977081298828, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8652399778366089, + "num_tokens": 298799360.0, + "step": 7832 + }, + { + "epoch": 0.9964381121994658, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.731626510620117, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8614574670791626, + "num_tokens": 298835669.0, + "step": 7833 + }, + { + "epoch": 0.9965653224780562, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.86362648010254, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8592772483825684, + "num_tokens": 298866871.0, + "step": 7834 + }, + { + "epoch": 0.9966925327566467, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.959266662597656, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8523584008216858, + "num_tokens": 298904136.0, + "step": 7835 + }, + { + "epoch": 0.9968197430352372, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.80258560180664, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8735203742980957, + "num_tokens": 298937069.0, + "step": 7836 + }, + { + "epoch": 0.9969469533138278, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.027511596679688, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8537420034408569, + "num_tokens": 298976778.0, + "step": 7837 + }, + { + "epoch": 0.9970741635924183, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.60384178161621, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8722800016403198, + "num_tokens": 299010808.0, + "step": 7838 + }, + { + "epoch": 0.9972013738710088, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.968448638916016, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8508193492889404, + "num_tokens": 299047611.0, + "step": 7839 + }, + { + "epoch": 0.9973285841495992, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.818889617919922, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8658631443977356, + "num_tokens": 299088668.0, + "step": 7840 + }, + { + "epoch": 0.9974557944281898, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.876665115356445, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8413798213005066, + "num_tokens": 299126572.0, + "step": 7841 + }, + { + "epoch": 0.9975830047067803, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.701921463012695, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8733308911323547, + "num_tokens": 299164806.0, + "step": 7842 + }, + { + "epoch": 0.9977102149853708, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.726011276245117, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8571323752403259, + "num_tokens": 299207039.0, + "step": 7843 + }, + { + "epoch": 0.9978374252639614, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.931110382080078, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.871174156665802, + "num_tokens": 299243242.0, + "step": 7844 + }, + { + "epoch": 0.9979646355425519, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.067750930786133, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8680892586708069, + "num_tokens": 299282326.0, + "step": 7845 + }, + { + "epoch": 0.9980918458211423, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.823467254638672, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8896456956863403, + "num_tokens": 299316645.0, + "step": 7846 + }, + { + "epoch": 0.9982190560997328, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.12502670288086, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8637057542800903, + "num_tokens": 299349932.0, + "step": 7847 + }, + { + "epoch": 0.9983462663783234, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.940839767456055, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.839906632900238, + "num_tokens": 299392452.0, + "step": 7848 + }, + { + "epoch": 0.9984734766569139, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.278926849365234, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8670395612716675, + "num_tokens": 299431479.0, + "step": 7849 + }, + { + "epoch": 0.9986006869355044, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.997230529785156, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8590586185455322, + "num_tokens": 299475585.0, + "step": 7850 + }, + { + "epoch": 0.9987278972140949, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87594223022461, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8768775463104248, + "num_tokens": 299514026.0, + "step": 7851 + }, + { + "epoch": 0.9988551074926854, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.13117218017578, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8505231142044067, + "num_tokens": 299550858.0, + "step": 7852 + }, + { + "epoch": 0.9989823177712759, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.170209884643555, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8554906845092773, + "num_tokens": 299586831.0, + "step": 7853 + }, + { + "epoch": 0.9991095280498664, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.762338638305664, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8713630437850952, + "num_tokens": 299623116.0, + "step": 7854 + }, + { + "epoch": 0.9992367383284569, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.10563850402832, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8645231127738953, + "num_tokens": 299662165.0, + "step": 7855 + }, + { + "epoch": 0.9993639486070475, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.950693130493164, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8583987355232239, + "num_tokens": 299702581.0, + "step": 7856 + }, + { + "epoch": 0.999491158885638, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.843233108520508, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8471527695655823, + "num_tokens": 299742559.0, + "step": 7857 + }, + { + "epoch": 0.9996183691642284, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.901981353759766, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8777469992637634, + "num_tokens": 299778637.0, + "step": 7858 + }, + { + "epoch": 0.9997455794428189, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.837106704711914, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8708160519599915, + "num_tokens": 299812808.0, + "step": 7859 + }, + { + "epoch": 0.9998727897214095, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.891817092895508, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8586992025375366, + "num_tokens": 299848987.0, + "step": 7860 + }, + { + "epoch": 1.0, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.974308013916016, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8430765867233276, + "num_tokens": 299886286.0, + "step": 7861 + }, + { + "epoch": 1.0001272102785905, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.867300033569336, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8555492162704468, + "num_tokens": 299925456.0, + "step": 7862 + }, + { + "epoch": 1.000254420557181, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.79961395263672, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8592106103897095, + "num_tokens": 299965936.0, + "step": 7863 + }, + { + "epoch": 1.0003816308357716, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.914241790771484, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8755083084106445, + "num_tokens": 300003181.0, + "step": 7864 + }, + { + "epoch": 1.0005088411143621, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.899656295776367, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8617417812347412, + "num_tokens": 300040502.0, + "step": 7865 + }, + { + "epoch": 1.0006360513929526, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.94869041442871, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8775816559791565, + "num_tokens": 300078795.0, + "step": 7866 + }, + { + "epoch": 1.0007632616715432, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.863693237304688, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8732138872146606, + "num_tokens": 300115188.0, + "step": 7867 + }, + { + "epoch": 1.0008904719501335, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.111108779907227, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8650451898574829, + "num_tokens": 300158583.0, + "step": 7868 + }, + { + "epoch": 1.001017682228724, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.840269088745117, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8681590557098389, + "num_tokens": 300194415.0, + "step": 7869 + }, + { + "epoch": 1.0011448925073145, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.17867660522461, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8588362336158752, + "num_tokens": 300235066.0, + "step": 7870 + }, + { + "epoch": 1.001272102785905, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.797361373901367, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8656677007675171, + "num_tokens": 300272167.0, + "step": 7871 + }, + { + "epoch": 1.0013993130644956, + "ewc_loss": 0.0284423828125, + "ewc_loss_parallel": 2.849102020263672e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.959800720214844, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8540999889373779, + "num_tokens": 300306032.0, + "step": 7872 + }, + { + "epoch": 1.0015265233430861, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04514503479004, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.859115481376648, + "num_tokens": 300338433.0, + "step": 7873 + }, + { + "epoch": 1.0016537336216766, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.856605529785156, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8523070812225342, + "num_tokens": 300376527.0, + "step": 7874 + }, + { + "epoch": 1.0017809439002672, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.850366592407227, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8803912401199341, + "num_tokens": 300417309.0, + "step": 7875 + }, + { + "epoch": 1.0019081541788577, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.848390579223633, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8647580146789551, + "num_tokens": 300459438.0, + "step": 7876 + }, + { + "epoch": 1.0020353644574482, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.777746200561523, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8400127291679382, + "num_tokens": 300501443.0, + "step": 7877 + }, + { + "epoch": 1.0021625747360388, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.776323318481445, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8710161447525024, + "num_tokens": 300531918.0, + "step": 7878 + }, + { + "epoch": 1.0022897850146293, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.909709930419922, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8574730157852173, + "num_tokens": 300575084.0, + "step": 7879 + }, + { + "epoch": 1.0024169952932196, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.80096435546875, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8639969229698181, + "num_tokens": 300608737.0, + "step": 7880 + }, + { + "epoch": 1.0025442055718101, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.975317001342773, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8522838354110718, + "num_tokens": 300647855.0, + "step": 7881 + }, + { + "epoch": 1.0026714158504006, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90850830078125, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8637558221817017, + "num_tokens": 300689763.0, + "step": 7882 + }, + { + "epoch": 1.0027986261289912, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.83482551574707, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8741211295127869, + "num_tokens": 300725838.0, + "step": 7883 + }, + { + "epoch": 1.0029258364075817, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.897327423095703, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8696737289428711, + "num_tokens": 300760423.0, + "step": 7884 + }, + { + "epoch": 1.0030530466861722, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.027864456176758, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8695775270462036, + "num_tokens": 300796236.0, + "step": 7885 + }, + { + "epoch": 1.0031802569647628, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04268455505371, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.867889404296875, + "num_tokens": 300835213.0, + "step": 7886 + }, + { + "epoch": 1.0033074672433533, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.693403244018555, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8708187341690063, + "num_tokens": 300873363.0, + "step": 7887 + }, + { + "epoch": 1.0034346775219438, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.174320220947266, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8584357500076294, + "num_tokens": 300910233.0, + "step": 7888 + }, + { + "epoch": 1.0035618878005343, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.876548767089844, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8709367513656616, + "num_tokens": 300946986.0, + "step": 7889 + }, + { + "epoch": 1.0036890980791249, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.82329750061035, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8653544187545776, + "num_tokens": 300985750.0, + "step": 7890 + }, + { + "epoch": 1.0038163083577154, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.838529586791992, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.863603949546814, + "num_tokens": 301032820.0, + "step": 7891 + }, + { + "epoch": 1.0039435186363057, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.94563102722168, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8534290194511414, + "num_tokens": 301070825.0, + "step": 7892 + }, + { + "epoch": 1.0040707289148962, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.878807067871094, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8599182367324829, + "num_tokens": 301110764.0, + "step": 7893 + }, + { + "epoch": 1.0041979391934868, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.855613708496094, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8667706251144409, + "num_tokens": 301152421.0, + "step": 7894 + }, + { + "epoch": 1.0043251494720773, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.97842788696289, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8646528720855713, + "num_tokens": 301192780.0, + "step": 7895 + }, + { + "epoch": 1.0044523597506678, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.636510848999023, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8603318929672241, + "num_tokens": 301228147.0, + "step": 7896 + }, + { + "epoch": 1.0045795700292584, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.01824951171875, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8728866577148438, + "num_tokens": 301259568.0, + "step": 7897 + }, + { + "epoch": 1.0047067803078489, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.976036071777344, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8644728660583496, + "num_tokens": 301296483.0, + "step": 7898 + }, + { + "epoch": 1.0048339905864394, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.70451545715332, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8666351437568665, + "num_tokens": 301337437.0, + "step": 7899 + }, + { + "epoch": 1.00496120086503, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.880910873413086, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8630673289299011, + "num_tokens": 301368777.0, + "step": 7900 + }, + { + "epoch": 1.0050884111436205, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.72993278503418, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8601131439208984, + "num_tokens": 301404885.0, + "step": 7901 + }, + { + "epoch": 1.005215621422211, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.858665466308594, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8535637855529785, + "num_tokens": 301445308.0, + "step": 7902 + }, + { + "epoch": 1.0053428317008015, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.974822998046875, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8605059385299683, + "num_tokens": 301484845.0, + "step": 7903 + }, + { + "epoch": 1.0054700419793918, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.80000114440918, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8604201078414917, + "num_tokens": 301524498.0, + "step": 7904 + }, + { + "epoch": 1.0055972522579824, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90508270263672, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8684690594673157, + "num_tokens": 301565522.0, + "step": 7905 + }, + { + "epoch": 1.0057244625365729, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90802001953125, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.858932614326477, + "num_tokens": 301603724.0, + "step": 7906 + }, + { + "epoch": 1.0058516728151634, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.198610305786133, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8610912561416626, + "num_tokens": 301636650.0, + "step": 7907 + }, + { + "epoch": 1.005978883093754, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.090660095214844, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8539486527442932, + "num_tokens": 301675328.0, + "step": 7908 + }, + { + "epoch": 1.0061060933723445, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98701286315918, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8567162752151489, + "num_tokens": 301710058.0, + "step": 7909 + }, + { + "epoch": 1.006233303650935, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.063413619995117, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8570697903633118, + "num_tokens": 301753567.0, + "step": 7910 + }, + { + "epoch": 1.0063605139295255, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.225643157958984, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8706458210945129, + "num_tokens": 301792895.0, + "step": 7911 + }, + { + "epoch": 1.006487724208116, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.985172271728516, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8748183846473694, + "num_tokens": 301833001.0, + "step": 7912 + }, + { + "epoch": 1.0066149344867066, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.329097747802734, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8671318888664246, + "num_tokens": 301871623.0, + "step": 7913 + }, + { + "epoch": 1.006742144765297, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.952701568603516, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8750649690628052, + "num_tokens": 301903750.0, + "step": 7914 + }, + { + "epoch": 1.0068693550438876, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.117504119873047, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8652776479721069, + "num_tokens": 301943079.0, + "step": 7915 + }, + { + "epoch": 1.0069965653224782, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.226272583007812, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8678438663482666, + "num_tokens": 301980031.0, + "step": 7916 + }, + { + "epoch": 1.0071237756010685, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.770265579223633, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8631734848022461, + "num_tokens": 302020454.0, + "step": 7917 + }, + { + "epoch": 1.007250985879659, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23903465270996, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8575630187988281, + "num_tokens": 302064391.0, + "step": 7918 + }, + { + "epoch": 1.0073781961582495, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.184541702270508, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8631687164306641, + "num_tokens": 302101598.0, + "step": 7919 + }, + { + "epoch": 1.00750540643684, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.993175506591797, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8673871159553528, + "num_tokens": 302138803.0, + "step": 7920 + }, + { + "epoch": 1.0076326167154306, + "ewc_loss": 0.0286865234375, + "ewc_loss_parallel": 2.872943878173828e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.138010025024414, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.860714316368103, + "num_tokens": 302173682.0, + "step": 7921 + }, + { + "epoch": 1.0077598269940211, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.030033111572266, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.860267162322998, + "num_tokens": 302207390.0, + "step": 7922 + }, + { + "epoch": 1.0078870372726116, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.082134246826172, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8694344162940979, + "num_tokens": 302244440.0, + "step": 7923 + }, + { + "epoch": 1.0080142475512022, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.065101623535156, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8576817512512207, + "num_tokens": 302281058.0, + "step": 7924 + }, + { + "epoch": 1.0081414578297927, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.24357032775879, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.879849910736084, + "num_tokens": 302319586.0, + "step": 7925 + }, + { + "epoch": 1.0082686681083832, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.016830444335938, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8588145971298218, + "num_tokens": 302361356.0, + "step": 7926 + }, + { + "epoch": 1.0083958783869738, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.911190032958984, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8717062473297119, + "num_tokens": 302399974.0, + "step": 7927 + }, + { + "epoch": 1.0085230886655643, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.02077293395996, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8663449287414551, + "num_tokens": 302435709.0, + "step": 7928 + }, + { + "epoch": 1.0086502989441546, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.873451232910156, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8594398498535156, + "num_tokens": 302474151.0, + "step": 7929 + }, + { + "epoch": 1.0087775092227451, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.125225067138672, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8653594255447388, + "num_tokens": 302509877.0, + "step": 7930 + }, + { + "epoch": 1.0089047195013356, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.86664581298828, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8773390650749207, + "num_tokens": 302547689.0, + "step": 7931 + }, + { + "epoch": 1.0090319297799262, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.02968978881836, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8616300821304321, + "num_tokens": 302590728.0, + "step": 7932 + }, + { + "epoch": 1.0091591400585167, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.751026153564453, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.841417133808136, + "num_tokens": 302625709.0, + "step": 7933 + }, + { + "epoch": 1.0092863503371072, + "ewc_loss": 0.028564453125, + "ewc_loss_parallel": 2.86102294921875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.10803985595703, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8720285296440125, + "num_tokens": 302662086.0, + "step": 7934 + }, + { + "epoch": 1.0094135606156978, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.8482608795166, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8796975612640381, + "num_tokens": 302703680.0, + "step": 7935 + }, + { + "epoch": 1.0095407708942883, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.980253219604492, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8490321636199951, + "num_tokens": 302740426.0, + "step": 7936 + }, + { + "epoch": 1.0096679811728788, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96271514892578, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8761206269264221, + "num_tokens": 302780778.0, + "step": 7937 + }, + { + "epoch": 1.0097951914514693, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.103708267211914, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8672103881835938, + "num_tokens": 302816012.0, + "step": 7938 + }, + { + "epoch": 1.0099224017300599, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96833038330078, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8410907983779907, + "num_tokens": 302858325.0, + "step": 7939 + }, + { + "epoch": 1.0100496120086504, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.003660202026367, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8647162318229675, + "num_tokens": 302898210.0, + "step": 7940 + }, + { + "epoch": 1.0101768222872407, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87078857421875, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8637207746505737, + "num_tokens": 302936072.0, + "step": 7941 + }, + { + "epoch": 1.0103040325658312, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.895938873291016, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8556669354438782, + "num_tokens": 302978941.0, + "step": 7942 + }, + { + "epoch": 1.0104312428444218, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.051570892333984, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8561166524887085, + "num_tokens": 303016664.0, + "step": 7943 + }, + { + "epoch": 1.0105584531230123, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.906869888305664, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8593002557754517, + "num_tokens": 303063313.0, + "step": 7944 + }, + { + "epoch": 1.0106856634016028, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.958921432495117, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8540968298912048, + "num_tokens": 303103786.0, + "step": 7945 + }, + { + "epoch": 1.0108128736801933, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.036067962646484, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8692241311073303, + "num_tokens": 303136928.0, + "step": 7946 + }, + { + "epoch": 1.0109400839587839, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.89676284790039, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8709968328475952, + "num_tokens": 303174854.0, + "step": 7947 + }, + { + "epoch": 1.0110672942373744, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.736797332763672, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8612841367721558, + "num_tokens": 303214234.0, + "step": 7948 + }, + { + "epoch": 1.011194504515965, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.981578826904297, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8765172362327576, + "num_tokens": 303251830.0, + "step": 7949 + }, + { + "epoch": 1.0113217147945555, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.008676528930664, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8534315824508667, + "num_tokens": 303291564.0, + "step": 7950 + }, + { + "epoch": 1.011448925073146, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03190803527832, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8611224889755249, + "num_tokens": 303328642.0, + "step": 7951 + }, + { + "epoch": 1.0115761353517365, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.884140014648438, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8613491058349609, + "num_tokens": 303373693.0, + "step": 7952 + }, + { + "epoch": 1.0117033456303268, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.10777473449707, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.867527186870575, + "num_tokens": 303407124.0, + "step": 7953 + }, + { + "epoch": 1.0118305559089174, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.982162475585938, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8749944567680359, + "num_tokens": 303441586.0, + "step": 7954 + }, + { + "epoch": 1.0119577661875079, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.876413345336914, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.864607572555542, + "num_tokens": 303473593.0, + "step": 7955 + }, + { + "epoch": 1.0120849764660984, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.923891067504883, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8595005869865417, + "num_tokens": 303510239.0, + "step": 7956 + }, + { + "epoch": 1.012212186744689, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96897315979004, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8613017797470093, + "num_tokens": 303544709.0, + "step": 7957 + }, + { + "epoch": 1.0123393970232795, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.047550201416016, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8733229041099548, + "num_tokens": 303581990.0, + "step": 7958 + }, + { + "epoch": 1.01246660730187, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.909503936767578, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8662756681442261, + "num_tokens": 303625167.0, + "step": 7959 + }, + { + "epoch": 1.0125938175804605, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.99168586730957, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.863457441329956, + "num_tokens": 303664751.0, + "step": 7960 + }, + { + "epoch": 1.012721027859051, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.81326675415039, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8611926436424255, + "num_tokens": 303702190.0, + "step": 7961 + }, + { + "epoch": 1.0128482381376416, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.93709373474121, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8783565163612366, + "num_tokens": 303742872.0, + "step": 7962 + }, + { + "epoch": 1.012975448416232, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.20248031616211, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8570690155029297, + "num_tokens": 303781265.0, + "step": 7963 + }, + { + "epoch": 1.0131026586948226, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.875247955322266, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8643887042999268, + "num_tokens": 303820686.0, + "step": 7964 + }, + { + "epoch": 1.0132298689734132, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.930482864379883, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8664988279342651, + "num_tokens": 303863428.0, + "step": 7965 + }, + { + "epoch": 1.0133570792520035, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.934770584106445, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.858797550201416, + "num_tokens": 303903847.0, + "step": 7966 + }, + { + "epoch": 1.013484289530594, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.855690002441406, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8555608987808228, + "num_tokens": 303943083.0, + "step": 7967 + }, + { + "epoch": 1.0136114998091845, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.1268253326416, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8729466199874878, + "num_tokens": 303978773.0, + "step": 7968 + }, + { + "epoch": 1.013738710087775, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.978416442871094, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8713469505310059, + "num_tokens": 304017058.0, + "step": 7969 + }, + { + "epoch": 1.0138659203663656, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.943613052368164, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.856066107749939, + "num_tokens": 304054104.0, + "step": 7970 + }, + { + "epoch": 1.013993130644956, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.024585723876953, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8779042959213257, + "num_tokens": 304091100.0, + "step": 7971 + }, + { + "epoch": 1.0141203409235466, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.05829620361328, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8649404048919678, + "num_tokens": 304127747.0, + "step": 7972 + }, + { + "epoch": 1.0142475512021372, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.834270477294922, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8637754917144775, + "num_tokens": 304171089.0, + "step": 7973 + }, + { + "epoch": 1.0143747614807277, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.17913055419922, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8546313643455505, + "num_tokens": 304210207.0, + "step": 7974 + }, + { + "epoch": 1.0145019717593182, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03327178955078, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8811832666397095, + "num_tokens": 304249423.0, + "step": 7975 + }, + { + "epoch": 1.0146291820379088, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.081926345825195, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8677225708961487, + "num_tokens": 304286872.0, + "step": 7976 + }, + { + "epoch": 1.0147563923164993, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.889869689941406, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8678423166275024, + "num_tokens": 304323033.0, + "step": 7977 + }, + { + "epoch": 1.0148836025950896, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.023571014404297, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8661714792251587, + "num_tokens": 304360096.0, + "step": 7978 + }, + { + "epoch": 1.0150108128736801, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.948665618896484, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8843954205513, + "num_tokens": 304396364.0, + "step": 7979 + }, + { + "epoch": 1.0151380231522706, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96242332458496, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8656309843063354, + "num_tokens": 304431583.0, + "step": 7980 + }, + { + "epoch": 1.0152652334308612, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.063936233520508, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8579077124595642, + "num_tokens": 304470531.0, + "step": 7981 + }, + { + "epoch": 1.0153924437094517, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.937501907348633, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8690730929374695, + "num_tokens": 304505187.0, + "step": 7982 + }, + { + "epoch": 1.0155196539880422, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.968660354614258, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8422595262527466, + "num_tokens": 304542685.0, + "step": 7983 + }, + { + "epoch": 1.0156468642666328, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.982280731201172, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8635270595550537, + "num_tokens": 304579204.0, + "step": 7984 + }, + { + "epoch": 1.0157740745452233, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.905725479125977, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8712281584739685, + "num_tokens": 304615224.0, + "step": 7985 + }, + { + "epoch": 1.0159012848238138, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.007036209106445, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.859291672706604, + "num_tokens": 304653513.0, + "step": 7986 + }, + { + "epoch": 1.0160284951024043, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.07404327392578, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8688448667526245, + "num_tokens": 304686744.0, + "step": 7987 + }, + { + "epoch": 1.0161557053809949, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.99135398864746, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8636742830276489, + "num_tokens": 304727262.0, + "step": 7988 + }, + { + "epoch": 1.0162829156595854, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.830829620361328, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8624459505081177, + "num_tokens": 304763418.0, + "step": 7989 + }, + { + "epoch": 1.0164101259381757, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.243289947509766, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8684002161026001, + "num_tokens": 304800763.0, + "step": 7990 + }, + { + "epoch": 1.0165373362167662, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.930116653442383, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.882625937461853, + "num_tokens": 304833136.0, + "step": 7991 + }, + { + "epoch": 1.0166645464953568, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.884910583496094, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.873215913772583, + "num_tokens": 304873208.0, + "step": 7992 + }, + { + "epoch": 1.0167917567739473, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.135208129882812, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8659044504165649, + "num_tokens": 304914237.0, + "step": 7993 + }, + { + "epoch": 1.0169189670525378, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.800701141357422, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8800981044769287, + "num_tokens": 304955453.0, + "step": 7994 + }, + { + "epoch": 1.0170461773311283, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.1788272857666, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8640453815460205, + "num_tokens": 304990533.0, + "step": 7995 + }, + { + "epoch": 1.0171733876097189, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.114727020263672, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8633390665054321, + "num_tokens": 305024849.0, + "step": 7996 + }, + { + "epoch": 1.0173005978883094, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.984081268310547, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8611063957214355, + "num_tokens": 305061615.0, + "step": 7997 + }, + { + "epoch": 1.0174278081669, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.909093856811523, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8456474542617798, + "num_tokens": 305102251.0, + "step": 7998 + }, + { + "epoch": 1.0175550184454905, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.006093978881836, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8722009062767029, + "num_tokens": 305135405.0, + "step": 7999 + }, + { + "epoch": 1.017682228724081, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.977272033691406, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8631648421287537, + "num_tokens": 305169005.0, + "step": 8000 + }, + { + "epoch": 1.0178094390026715, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.787059783935547, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8606516122817993, + "num_tokens": 305209050.0, + "step": 8001 + }, + { + "epoch": 1.0179366492812618, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90952491760254, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.863435685634613, + "num_tokens": 305238250.0, + "step": 8002 + }, + { + "epoch": 1.0180638595598523, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.911174774169922, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8678833246231079, + "num_tokens": 305283544.0, + "step": 8003 + }, + { + "epoch": 1.0181910698384429, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.88859748840332, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8482042551040649, + "num_tokens": 305320548.0, + "step": 8004 + }, + { + "epoch": 1.0183182801170334, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.021421432495117, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8706434965133667, + "num_tokens": 305359560.0, + "step": 8005 + }, + { + "epoch": 1.018445490395624, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.963611602783203, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8612985610961914, + "num_tokens": 305396680.0, + "step": 8006 + }, + { + "epoch": 1.0185727006742145, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.122472763061523, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8732548952102661, + "num_tokens": 305432856.0, + "step": 8007 + }, + { + "epoch": 1.018699910952805, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.00794219970703, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8666418194770813, + "num_tokens": 305469973.0, + "step": 8008 + }, + { + "epoch": 1.0188271212313955, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.880556106567383, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8705660104751587, + "num_tokens": 305505576.0, + "step": 8009 + }, + { + "epoch": 1.018954331509986, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.020734786987305, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8464886546134949, + "num_tokens": 305542037.0, + "step": 8010 + }, + { + "epoch": 1.0190815417885766, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.066099166870117, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8728821277618408, + "num_tokens": 305587784.0, + "step": 8011 + }, + { + "epoch": 1.019208752067167, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96040916442871, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8579261898994446, + "num_tokens": 305621888.0, + "step": 8012 + }, + { + "epoch": 1.0193359623457576, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.863576889038086, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8840466737747192, + "num_tokens": 305663844.0, + "step": 8013 + }, + { + "epoch": 1.0194631726243482, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.830699920654297, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.879280149936676, + "num_tokens": 305695913.0, + "step": 8014 + }, + { + "epoch": 1.0195903829029385, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.813589096069336, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8635425567626953, + "num_tokens": 305733632.0, + "step": 8015 + }, + { + "epoch": 1.019717593181529, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.776369094848633, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8491208553314209, + "num_tokens": 305766917.0, + "step": 8016 + }, + { + "epoch": 1.0198448034601195, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98912811279297, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8594099879264832, + "num_tokens": 305805768.0, + "step": 8017 + }, + { + "epoch": 1.01997201373871, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.79865264892578, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8654031157493591, + "num_tokens": 305843412.0, + "step": 8018 + }, + { + "epoch": 1.0200992240173006, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.99752426147461, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8794724941253662, + "num_tokens": 305880523.0, + "step": 8019 + }, + { + "epoch": 1.020226434295891, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.049283981323242, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8663711547851562, + "num_tokens": 305915379.0, + "step": 8020 + }, + { + "epoch": 1.0203536445744816, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.051050186157227, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8784019351005554, + "num_tokens": 305952471.0, + "step": 8021 + }, + { + "epoch": 1.0204808548530722, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.838712692260742, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8635459542274475, + "num_tokens": 305994023.0, + "step": 8022 + }, + { + "epoch": 1.0206080651316627, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.95840072631836, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8692694902420044, + "num_tokens": 306026563.0, + "step": 8023 + }, + { + "epoch": 1.0207352754102532, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.867454528808594, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.861397922039032, + "num_tokens": 306067913.0, + "step": 8024 + }, + { + "epoch": 1.0208624856888437, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.098731994628906, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8614994287490845, + "num_tokens": 306108692.0, + "step": 8025 + }, + { + "epoch": 1.0209896959674343, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.939817428588867, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8653675317764282, + "num_tokens": 306148082.0, + "step": 8026 + }, + { + "epoch": 1.0211169062460246, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.975818634033203, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8726205825805664, + "num_tokens": 306182800.0, + "step": 8027 + }, + { + "epoch": 1.021244116524615, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.9249210357666, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8621935248374939, + "num_tokens": 306226741.0, + "step": 8028 + }, + { + "epoch": 1.0213713268032056, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.992406845092773, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8629580736160278, + "num_tokens": 306265993.0, + "step": 8029 + }, + { + "epoch": 1.0214985370817962, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04789161682129, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8690388202667236, + "num_tokens": 306306511.0, + "step": 8030 + }, + { + "epoch": 1.0216257473603867, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.94236183166504, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8793667554855347, + "num_tokens": 306337687.0, + "step": 8031 + }, + { + "epoch": 1.0217529576389772, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.89488983154297, + "learning_rate": 1e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8303958773612976, + "num_tokens": 306379646.0, + "step": 8032 + }, + { + "epoch": 1.0218801679175677, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.768848419189453, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8645097017288208, + "num_tokens": 306415389.0, + "step": 8033 + }, + { + "epoch": 1.0220073781961583, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03243064880371, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.859377920627594, + "num_tokens": 306452035.0, + "step": 8034 + }, + { + "epoch": 1.0221345884747488, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.979543685913086, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8641976714134216, + "num_tokens": 306496277.0, + "step": 8035 + }, + { + "epoch": 1.0222617987533393, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.88776206970215, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8698869943618774, + "num_tokens": 306528212.0, + "step": 8036 + }, + { + "epoch": 1.0223890090319299, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.112028121948242, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8688249588012695, + "num_tokens": 306562047.0, + "step": 8037 + }, + { + "epoch": 1.0225162193105204, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.005765914916992, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8548321723937988, + "num_tokens": 306601298.0, + "step": 8038 + }, + { + "epoch": 1.0226434295891107, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.7993221282959, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8644874095916748, + "num_tokens": 306640961.0, + "step": 8039 + }, + { + "epoch": 1.0227706398677012, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.071083068847656, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8835591077804565, + "num_tokens": 306679507.0, + "step": 8040 + }, + { + "epoch": 1.0228978501462918, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.865236282348633, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8558943271636963, + "num_tokens": 306717337.0, + "step": 8041 + }, + { + "epoch": 1.0230250604248823, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.12845230102539, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.839080810546875, + "num_tokens": 306754152.0, + "step": 8042 + }, + { + "epoch": 1.0231522707034728, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98719596862793, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8588963747024536, + "num_tokens": 306794887.0, + "step": 8043 + }, + { + "epoch": 1.0232794809820633, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.001195907592773, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.868255615234375, + "num_tokens": 306828374.0, + "step": 8044 + }, + { + "epoch": 1.0234066912606539, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.292335510253906, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8684214353561401, + "num_tokens": 306865285.0, + "step": 8045 + }, + { + "epoch": 1.0235339015392444, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.909664154052734, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8704273700714111, + "num_tokens": 306897195.0, + "step": 8046 + }, + { + "epoch": 1.023661111817835, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.074901580810547, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8629174828529358, + "num_tokens": 306935352.0, + "step": 8047 + }, + { + "epoch": 1.0237883220964255, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.811084747314453, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8700718879699707, + "num_tokens": 306964621.0, + "step": 8048 + }, + { + "epoch": 1.023915532375016, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.99736213684082, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8627002239227295, + "num_tokens": 307003451.0, + "step": 8049 + }, + { + "epoch": 1.0240427426536065, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96295166015625, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8657796382904053, + "num_tokens": 307044695.0, + "step": 8050 + }, + { + "epoch": 1.0241699529321968, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.065868377685547, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8718048334121704, + "num_tokens": 307080603.0, + "step": 8051 + }, + { + "epoch": 1.0242971632107873, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.074352264404297, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8581230044364929, + "num_tokens": 307122300.0, + "step": 8052 + }, + { + "epoch": 1.0244243734893779, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.904821395874023, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8709627389907837, + "num_tokens": 307158685.0, + "step": 8053 + }, + { + "epoch": 1.0245515837679684, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.038930892944336, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8710838556289673, + "num_tokens": 307199104.0, + "step": 8054 + }, + { + "epoch": 1.024678794046559, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.842592239379883, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8569301962852478, + "num_tokens": 307235840.0, + "step": 8055 + }, + { + "epoch": 1.0248060043251495, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.965801239013672, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8536605834960938, + "num_tokens": 307273089.0, + "step": 8056 + }, + { + "epoch": 1.02493321460374, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.124364852905273, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8657152652740479, + "num_tokens": 307318825.0, + "step": 8057 + }, + { + "epoch": 1.0250604248823305, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.93976402282715, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8719347715377808, + "num_tokens": 307357006.0, + "step": 8058 + }, + { + "epoch": 1.025187635160921, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96866798400879, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8631935715675354, + "num_tokens": 307390274.0, + "step": 8059 + }, + { + "epoch": 1.0253148454395116, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.916236877441406, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8735224008560181, + "num_tokens": 307427157.0, + "step": 8060 + }, + { + "epoch": 1.025442055718102, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.953327178955078, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8620717525482178, + "num_tokens": 307466555.0, + "step": 8061 + }, + { + "epoch": 1.0255692659966926, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.910024642944336, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8568053245544434, + "num_tokens": 307503375.0, + "step": 8062 + }, + { + "epoch": 1.0256964762752832, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98670196533203, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.874430239200592, + "num_tokens": 307543170.0, + "step": 8063 + }, + { + "epoch": 1.0258236865538735, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87894630432129, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8613612651824951, + "num_tokens": 307582890.0, + "step": 8064 + }, + { + "epoch": 1.025950896832464, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.036680221557617, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8577889204025269, + "num_tokens": 307617643.0, + "step": 8065 + }, + { + "epoch": 1.0260781071110545, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.008142471313477, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8620209097862244, + "num_tokens": 307652258.0, + "step": 8066 + }, + { + "epoch": 1.026205317389645, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.11176109313965, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8598939180374146, + "num_tokens": 307693245.0, + "step": 8067 + }, + { + "epoch": 1.0263325276682356, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.106830596923828, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8678012490272522, + "num_tokens": 307728751.0, + "step": 8068 + }, + { + "epoch": 1.026459737946826, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.10836410522461, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8536937236785889, + "num_tokens": 307767869.0, + "step": 8069 + }, + { + "epoch": 1.0265869482254166, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.054983139038086, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8744543790817261, + "num_tokens": 307809438.0, + "step": 8070 + }, + { + "epoch": 1.0267141585040072, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.8480167388916, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8447173237800598, + "num_tokens": 307848752.0, + "step": 8071 + }, + { + "epoch": 1.0268413687825977, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.07524871826172, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8590880632400513, + "num_tokens": 307886080.0, + "step": 8072 + }, + { + "epoch": 1.0269685790611882, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.776113510131836, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8781967163085938, + "num_tokens": 307922854.0, + "step": 8073 + }, + { + "epoch": 1.0270957893397787, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25633430480957, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8615496158599854, + "num_tokens": 307960032.0, + "step": 8074 + }, + { + "epoch": 1.0272229996183693, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.008949279785156, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8643141984939575, + "num_tokens": 307996074.0, + "step": 8075 + }, + { + "epoch": 1.0273502098969596, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04245376586914, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.877522349357605, + "num_tokens": 308030833.0, + "step": 8076 + }, + { + "epoch": 1.02747742017555, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.999378204345703, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8747180700302124, + "num_tokens": 308079620.0, + "step": 8077 + }, + { + "epoch": 1.0276046304541406, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19940185546875, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8586850762367249, + "num_tokens": 308118555.0, + "step": 8078 + }, + { + "epoch": 1.0277318407327312, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.08755111694336, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8479247689247131, + "num_tokens": 308154410.0, + "step": 8079 + }, + { + "epoch": 1.0278590510113217, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.92872428894043, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8647412657737732, + "num_tokens": 308191990.0, + "step": 8080 + }, + { + "epoch": 1.0279862612899122, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.902320861816406, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8441569805145264, + "num_tokens": 308231047.0, + "step": 8081 + }, + { + "epoch": 1.0281134715685027, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.066490173339844, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.87796950340271, + "num_tokens": 308269487.0, + "step": 8082 + }, + { + "epoch": 1.0282406818470933, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.754962921142578, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8641736507415771, + "num_tokens": 308304974.0, + "step": 8083 + }, + { + "epoch": 1.0283678921256838, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.100065231323242, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8862995505332947, + "num_tokens": 308342072.0, + "step": 8084 + }, + { + "epoch": 1.0284951024042743, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.15563201904297, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8601870536804199, + "num_tokens": 308374293.0, + "step": 8085 + }, + { + "epoch": 1.0286223126828649, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.904653549194336, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8691504001617432, + "num_tokens": 308415872.0, + "step": 8086 + }, + { + "epoch": 1.0287495229614554, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.15928077697754, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8579583764076233, + "num_tokens": 308451198.0, + "step": 8087 + }, + { + "epoch": 1.0288767332400457, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.918888092041016, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8722884654998779, + "num_tokens": 308490554.0, + "step": 8088 + }, + { + "epoch": 1.0290039435186362, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.989957809448242, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8587548136711121, + "num_tokens": 308530221.0, + "step": 8089 + }, + { + "epoch": 1.0291311537972267, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.11559295654297, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.84812331199646, + "num_tokens": 308563634.0, + "step": 8090 + }, + { + "epoch": 1.0292583640758173, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19879722595215, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8646583557128906, + "num_tokens": 308605033.0, + "step": 8091 + }, + { + "epoch": 1.0293855743544078, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.917863845825195, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8653745651245117, + "num_tokens": 308642603.0, + "step": 8092 + }, + { + "epoch": 1.0295127846329983, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.90250587463379, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8617908358573914, + "num_tokens": 308685189.0, + "step": 8093 + }, + { + "epoch": 1.0296399949115889, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16558265686035, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8634176254272461, + "num_tokens": 308726052.0, + "step": 8094 + }, + { + "epoch": 1.0297672051901794, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.907554626464844, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.852721095085144, + "num_tokens": 308764828.0, + "step": 8095 + }, + { + "epoch": 1.02989441546877, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.074018478393555, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8754488825798035, + "num_tokens": 308799712.0, + "step": 8096 + }, + { + "epoch": 1.0300216257473604, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.098125457763672, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.863864541053772, + "num_tokens": 308839879.0, + "step": 8097 + }, + { + "epoch": 1.030148836025951, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.784725189208984, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8576827645301819, + "num_tokens": 308880362.0, + "step": 8098 + }, + { + "epoch": 1.0302760463045415, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.999404907226562, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8741785287857056, + "num_tokens": 308921395.0, + "step": 8099 + }, + { + "epoch": 1.0304032565831318, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.11532974243164, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8731046915054321, + "num_tokens": 308959976.0, + "step": 8100 + }, + { + "epoch": 1.0305304668617223, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.025556564331055, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8777540922164917, + "num_tokens": 309000702.0, + "step": 8101 + }, + { + "epoch": 1.0306576771403129, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.045818328857422, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8824714422225952, + "num_tokens": 309036540.0, + "step": 8102 + }, + { + "epoch": 1.0307848874189034, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.10011863708496, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8635367751121521, + "num_tokens": 309072408.0, + "step": 8103 + }, + { + "epoch": 1.030912097697494, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.02299690246582, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8515489101409912, + "num_tokens": 309110366.0, + "step": 8104 + }, + { + "epoch": 1.0310393079760845, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.09744644165039, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8679195642471313, + "num_tokens": 309155194.0, + "step": 8105 + }, + { + "epoch": 1.031166518254675, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.14566421508789, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8725458383560181, + "num_tokens": 309193747.0, + "step": 8106 + }, + { + "epoch": 1.0312937285332655, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.108510971069336, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.853469967842102, + "num_tokens": 309234161.0, + "step": 8107 + }, + { + "epoch": 1.031420938811856, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.025075912475586, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8538111448287964, + "num_tokens": 309272574.0, + "step": 8108 + }, + { + "epoch": 1.0315481490904466, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.097732543945312, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8676111698150635, + "num_tokens": 309306567.0, + "step": 8109 + }, + { + "epoch": 1.031675359369037, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.029470443725586, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8693715929985046, + "num_tokens": 309347218.0, + "step": 8110 + }, + { + "epoch": 1.0318025696476276, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.067428588867188, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8786382675170898, + "num_tokens": 309385701.0, + "step": 8111 + }, + { + "epoch": 1.0319297799262181, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.041719436645508, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.862741231918335, + "num_tokens": 309424600.0, + "step": 8112 + }, + { + "epoch": 1.0320569902048085, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.06748390197754, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8440122604370117, + "num_tokens": 309459929.0, + "step": 8113 + }, + { + "epoch": 1.032184200483399, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.033266067504883, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8565546274185181, + "num_tokens": 309505085.0, + "step": 8114 + }, + { + "epoch": 1.0323114107619895, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23891830444336, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8676568269729614, + "num_tokens": 309544746.0, + "step": 8115 + }, + { + "epoch": 1.03243862104058, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.065685272216797, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8511290550231934, + "num_tokens": 309584301.0, + "step": 8116 + }, + { + "epoch": 1.0325658313191706, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.001981735229492, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8689181804656982, + "num_tokens": 309622570.0, + "step": 8117 + }, + { + "epoch": 1.032693041597761, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03130531311035, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8485094308853149, + "num_tokens": 309666417.0, + "step": 8118 + }, + { + "epoch": 1.0328202518763516, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.033920288085938, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8555471301078796, + "num_tokens": 309705706.0, + "step": 8119 + }, + { + "epoch": 1.0329474621549422, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.021991729736328, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8736993670463562, + "num_tokens": 309744288.0, + "step": 8120 + }, + { + "epoch": 1.0330746724335327, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.129032135009766, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8518651723861694, + "num_tokens": 309780406.0, + "step": 8121 + }, + { + "epoch": 1.0332018827121232, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.09882164001465, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8326563835144043, + "num_tokens": 309811135.0, + "step": 8122 + }, + { + "epoch": 1.0333290929907137, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.05807113647461, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8689257502555847, + "num_tokens": 309847037.0, + "step": 8123 + }, + { + "epoch": 1.0334563032693043, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.052104949951172, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8648879528045654, + "num_tokens": 309884274.0, + "step": 8124 + }, + { + "epoch": 1.0335835135478946, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.938310623168945, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8768605589866638, + "num_tokens": 309915572.0, + "step": 8125 + }, + { + "epoch": 1.033710723826485, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.207983016967773, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8698893785476685, + "num_tokens": 309954502.0, + "step": 8126 + }, + { + "epoch": 1.0338379341050756, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.956932067871094, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8861523866653442, + "num_tokens": 309993015.0, + "step": 8127 + }, + { + "epoch": 1.0339651443836662, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.9018611907959, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8551398515701294, + "num_tokens": 310030840.0, + "step": 8128 + }, + { + "epoch": 1.0340923546622567, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.12421226501465, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8651735186576843, + "num_tokens": 310067586.0, + "step": 8129 + }, + { + "epoch": 1.0342195649408472, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.01880645751953, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.873485803604126, + "num_tokens": 310104496.0, + "step": 8130 + }, + { + "epoch": 1.0343467752194377, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.11959457397461, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8478438854217529, + "num_tokens": 310141162.0, + "step": 8131 + }, + { + "epoch": 1.0344739854980283, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.115070343017578, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8624555468559265, + "num_tokens": 310182965.0, + "step": 8132 + }, + { + "epoch": 1.0346011957766188, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.567272186279297, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8473783135414124, + "num_tokens": 310219572.0, + "step": 8133 + }, + { + "epoch": 1.0347284060552093, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98892593383789, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8525035381317139, + "num_tokens": 310248711.0, + "step": 8134 + }, + { + "epoch": 1.0348556163337999, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15219497680664, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8478843569755554, + "num_tokens": 310287915.0, + "step": 8135 + }, + { + "epoch": 1.0349828266123904, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.951583862304688, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8627173900604248, + "num_tokens": 310325619.0, + "step": 8136 + }, + { + "epoch": 1.0351100368909807, + "ewc_loss": 0.02880859375, + "ewc_loss_parallel": 2.8848648071289062e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3850040435791, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8726093769073486, + "num_tokens": 310362456.0, + "step": 8137 + }, + { + "epoch": 1.0352372471695712, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.261037826538086, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8593376278877258, + "num_tokens": 310399584.0, + "step": 8138 + }, + { + "epoch": 1.0353644574481617, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.857736587524414, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8577837944030762, + "num_tokens": 310437435.0, + "step": 8139 + }, + { + "epoch": 1.0354916677267523, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.385417938232422, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.869770348072052, + "num_tokens": 310476320.0, + "step": 8140 + }, + { + "epoch": 1.0356188780053428, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.234928131103516, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.880540132522583, + "num_tokens": 310512956.0, + "step": 8141 + }, + { + "epoch": 1.0357460882839333, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.74628448486328, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8625843524932861, + "num_tokens": 310547605.0, + "step": 8142 + }, + { + "epoch": 1.0358732985625239, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.14446449279785, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8620172739028931, + "num_tokens": 310586991.0, + "step": 8143 + }, + { + "epoch": 1.0360005088411144, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25641441345215, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8637140393257141, + "num_tokens": 310621271.0, + "step": 8144 + }, + { + "epoch": 1.036127719119705, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.870447158813477, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.867073655128479, + "num_tokens": 310662154.0, + "step": 8145 + }, + { + "epoch": 1.0362549293982954, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.976743698120117, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8739189505577087, + "num_tokens": 310701924.0, + "step": 8146 + }, + { + "epoch": 1.036382139676886, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.160274505615234, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8708542585372925, + "num_tokens": 310739688.0, + "step": 8147 + }, + { + "epoch": 1.0365093499554765, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.022451400756836, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.859103798866272, + "num_tokens": 310773317.0, + "step": 8148 + }, + { + "epoch": 1.0366365602340668, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.05674171447754, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8628867864608765, + "num_tokens": 310810854.0, + "step": 8149 + }, + { + "epoch": 1.0367637705126573, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.24118423461914, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8577571511268616, + "num_tokens": 310846080.0, + "step": 8150 + }, + { + "epoch": 1.0368909807912479, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.244705200195312, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8413223028182983, + "num_tokens": 310885032.0, + "step": 8151 + }, + { + "epoch": 1.0370181910698384, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.932416915893555, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8699919581413269, + "num_tokens": 310924518.0, + "step": 8152 + }, + { + "epoch": 1.037145401348429, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.104677200317383, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.866540253162384, + "num_tokens": 310961734.0, + "step": 8153 + }, + { + "epoch": 1.0372726116270194, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.97707176208496, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.866042971611023, + "num_tokens": 311004541.0, + "step": 8154 + }, + { + "epoch": 1.03739982190561, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.081233978271484, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8718897104263306, + "num_tokens": 311043947.0, + "step": 8155 + }, + { + "epoch": 1.0375270321842005, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.065210342407227, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8606895804405212, + "num_tokens": 311083154.0, + "step": 8156 + }, + { + "epoch": 1.037654242462791, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.959150314331055, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8755502104759216, + "num_tokens": 311121429.0, + "step": 8157 + }, + { + "epoch": 1.0377814527413816, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03326416015625, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8485391736030579, + "num_tokens": 311166578.0, + "step": 8158 + }, + { + "epoch": 1.037908663019972, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.05321502685547, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8666173815727234, + "num_tokens": 311209645.0, + "step": 8159 + }, + { + "epoch": 1.0380358732985626, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.239519119262695, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8677445650100708, + "num_tokens": 311248428.0, + "step": 8160 + }, + { + "epoch": 1.0381630835771531, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.057538986206055, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8648709058761597, + "num_tokens": 311282040.0, + "step": 8161 + }, + { + "epoch": 1.0382902938557435, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.009355545043945, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8715497255325317, + "num_tokens": 311320233.0, + "step": 8162 + }, + { + "epoch": 1.038417504134334, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.0611629486084, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8498117923736572, + "num_tokens": 311357586.0, + "step": 8163 + }, + { + "epoch": 1.0385447144129245, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.934167861938477, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.864060640335083, + "num_tokens": 311395545.0, + "step": 8164 + }, + { + "epoch": 1.038671924691515, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.197065353393555, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8637592792510986, + "num_tokens": 311428785.0, + "step": 8165 + }, + { + "epoch": 1.0387991349701056, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.209074020385742, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8525919914245605, + "num_tokens": 311469546.0, + "step": 8166 + }, + { + "epoch": 1.038926345248696, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.2431640625, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8562368750572205, + "num_tokens": 311504064.0, + "step": 8167 + }, + { + "epoch": 1.0390535555272866, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.923221588134766, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8472015857696533, + "num_tokens": 311536600.0, + "step": 8168 + }, + { + "epoch": 1.0391807658058771, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.85582160949707, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8809542059898376, + "num_tokens": 311573638.0, + "step": 8169 + }, + { + "epoch": 1.0393079760844677, + "ewc_loss": 0.029052734375, + "ewc_loss_parallel": 2.9087066650390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.978071212768555, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8558334112167358, + "num_tokens": 311611085.0, + "step": 8170 + }, + { + "epoch": 1.0394351863630582, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.381052017211914, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.857672929763794, + "num_tokens": 311655497.0, + "step": 8171 + }, + { + "epoch": 1.0395623966416487, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.07013702392578, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8490405082702637, + "num_tokens": 311688167.0, + "step": 8172 + }, + { + "epoch": 1.0396896069202393, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.811559677124023, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8639799952507019, + "num_tokens": 311724926.0, + "step": 8173 + }, + { + "epoch": 1.0398168171988296, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03112030029297, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8449568152427673, + "num_tokens": 311760729.0, + "step": 8174 + }, + { + "epoch": 1.03994402747742, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.183122634887695, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8638397455215454, + "num_tokens": 311802138.0, + "step": 8175 + }, + { + "epoch": 1.0400712377560106, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.857967376708984, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8709887266159058, + "num_tokens": 311838510.0, + "step": 8176 + }, + { + "epoch": 1.0401984480346012, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.856447219848633, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.856732964515686, + "num_tokens": 311870875.0, + "step": 8177 + }, + { + "epoch": 1.0403256583131917, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.205345153808594, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8768056035041809, + "num_tokens": 311905854.0, + "step": 8178 + }, + { + "epoch": 1.0404528685917822, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.947690963745117, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8606371879577637, + "num_tokens": 311936381.0, + "step": 8179 + }, + { + "epoch": 1.0405800788703727, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.105304718017578, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8568066954612732, + "num_tokens": 311972529.0, + "step": 8180 + }, + { + "epoch": 1.0407072891489633, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.847881317138672, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8818718791007996, + "num_tokens": 312007883.0, + "step": 8181 + }, + { + "epoch": 1.0408344994275538, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.07691764831543, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8595825433731079, + "num_tokens": 312041000.0, + "step": 8182 + }, + { + "epoch": 1.0409617097061443, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.949708938598633, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8681647777557373, + "num_tokens": 312076011.0, + "step": 8183 + }, + { + "epoch": 1.0410889199847349, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.97732925415039, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8575543165206909, + "num_tokens": 312115614.0, + "step": 8184 + }, + { + "epoch": 1.0412161302633254, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.040403366088867, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8562848567962646, + "num_tokens": 312154111.0, + "step": 8185 + }, + { + "epoch": 1.0413433405419157, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.98375701904297, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8580066561698914, + "num_tokens": 312191571.0, + "step": 8186 + }, + { + "epoch": 1.0414705508205062, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.053760528564453, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8556947708129883, + "num_tokens": 312229205.0, + "step": 8187 + }, + { + "epoch": 1.0415977610990967, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.892351150512695, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8779168725013733, + "num_tokens": 312266604.0, + "step": 8188 + }, + { + "epoch": 1.0417249713776873, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.018312454223633, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8537101149559021, + "num_tokens": 312305222.0, + "step": 8189 + }, + { + "epoch": 1.0418521816562778, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.971418380737305, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8412995934486389, + "num_tokens": 312345823.0, + "step": 8190 + }, + { + "epoch": 1.0419793919348683, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.046592712402344, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.854569673538208, + "num_tokens": 312383569.0, + "step": 8191 + }, + { + "epoch": 1.0421066022134589, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.216663360595703, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8665897846221924, + "num_tokens": 312423068.0, + "step": 8192 + }, + { + "epoch": 1.0422338124920494, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.242063522338867, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.858414351940155, + "num_tokens": 312460569.0, + "step": 8193 + }, + { + "epoch": 1.04236102277064, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.142372131347656, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8426178097724915, + "num_tokens": 312501654.0, + "step": 8194 + }, + { + "epoch": 1.0424882330492304, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.230228424072266, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.855014443397522, + "num_tokens": 312543630.0, + "step": 8195 + }, + { + "epoch": 1.042615443327821, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.00965690612793, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8613274097442627, + "num_tokens": 312579428.0, + "step": 8196 + }, + { + "epoch": 1.0427426536064115, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.17681121826172, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8772335648536682, + "num_tokens": 312621388.0, + "step": 8197 + }, + { + "epoch": 1.0428698638850018, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16218376159668, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8642988204956055, + "num_tokens": 312665303.0, + "step": 8198 + }, + { + "epoch": 1.0429970741635923, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.17696189880371, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8521643280982971, + "num_tokens": 312702354.0, + "step": 8199 + }, + { + "epoch": 1.0431242844421829, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.075176239013672, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8677424788475037, + "num_tokens": 312737223.0, + "step": 8200 + }, + { + "epoch": 1.0432514947207734, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.989578247070312, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8641670942306519, + "num_tokens": 312773353.0, + "step": 8201 + }, + { + "epoch": 1.043378704999364, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.272136688232422, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8454793691635132, + "num_tokens": 312809230.0, + "step": 8202 + }, + { + "epoch": 1.0435059152779544, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.177818298339844, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8800532817840576, + "num_tokens": 312841278.0, + "step": 8203 + }, + { + "epoch": 1.043633125556545, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.121612548828125, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8675031065940857, + "num_tokens": 312888469.0, + "step": 8204 + }, + { + "epoch": 1.0437603358351355, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.963178634643555, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8703266382217407, + "num_tokens": 312925446.0, + "step": 8205 + }, + { + "epoch": 1.043887546113726, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.319292068481445, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8617427349090576, + "num_tokens": 312965425.0, + "step": 8206 + }, + { + "epoch": 1.0440147563923166, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.646448135375977, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8576763272285461, + "num_tokens": 313000637.0, + "step": 8207 + }, + { + "epoch": 1.044141966670907, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.031553268432617, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8642590641975403, + "num_tokens": 313035986.0, + "step": 8208 + }, + { + "epoch": 1.0442691769494976, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37245750427246, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8522518873214722, + "num_tokens": 313077027.0, + "step": 8209 + }, + { + "epoch": 1.0443963872280881, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.023801803588867, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8739525079727173, + "num_tokens": 313121733.0, + "step": 8210 + }, + { + "epoch": 1.0445235975066784, + "ewc_loss": 0.0289306640625, + "ewc_loss_parallel": 2.8967857360839844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.06711769104004, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8577092289924622, + "num_tokens": 313165542.0, + "step": 8211 + }, + { + "epoch": 1.044650807785269, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.207965850830078, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8527710437774658, + "num_tokens": 313201082.0, + "step": 8212 + }, + { + "epoch": 1.0447780180638595, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.223848342895508, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8745174407958984, + "num_tokens": 313241765.0, + "step": 8213 + }, + { + "epoch": 1.04490522834245, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.007530212402344, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8618087768554688, + "num_tokens": 313282471.0, + "step": 8214 + }, + { + "epoch": 1.0450324386210406, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.008440017700195, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8592863082885742, + "num_tokens": 313331505.0, + "step": 8215 + }, + { + "epoch": 1.045159648899631, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.956090927124023, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8633811473846436, + "num_tokens": 313369823.0, + "step": 8216 + }, + { + "epoch": 1.0452868591782216, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.039962768554688, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8643361330032349, + "num_tokens": 313404255.0, + "step": 8217 + }, + { + "epoch": 1.0454140694568121, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.01264762878418, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8736312389373779, + "num_tokens": 313439368.0, + "step": 8218 + }, + { + "epoch": 1.0455412797354027, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.073978424072266, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8546768426895142, + "num_tokens": 313479475.0, + "step": 8219 + }, + { + "epoch": 1.0456684900139932, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.148914337158203, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8544039726257324, + "num_tokens": 313514604.0, + "step": 8220 + }, + { + "epoch": 1.0457957002925837, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.01604461669922, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8593534231185913, + "num_tokens": 313548907.0, + "step": 8221 + }, + { + "epoch": 1.0459229105711743, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.21487808227539, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8626686334609985, + "num_tokens": 313586615.0, + "step": 8222 + }, + { + "epoch": 1.0460501208497646, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.977092742919922, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8538897037506104, + "num_tokens": 313625202.0, + "step": 8223 + }, + { + "epoch": 1.046177331128355, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.160444259643555, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8564698696136475, + "num_tokens": 313663257.0, + "step": 8224 + }, + { + "epoch": 1.0463045414069456, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.00263786315918, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.867404043674469, + "num_tokens": 313701430.0, + "step": 8225 + }, + { + "epoch": 1.0464317516855361, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.02117156982422, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8736539483070374, + "num_tokens": 313741286.0, + "step": 8226 + }, + { + "epoch": 1.0465589619641267, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.128686904907227, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.848106324672699, + "num_tokens": 313779115.0, + "step": 8227 + }, + { + "epoch": 1.0466861722427172, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.002723693847656, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8680016398429871, + "num_tokens": 313822331.0, + "step": 8228 + }, + { + "epoch": 1.0468133825213077, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04090690612793, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8572682738304138, + "num_tokens": 313863545.0, + "step": 8229 + }, + { + "epoch": 1.0469405927998983, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.029268264770508, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8830980062484741, + "num_tokens": 313894956.0, + "step": 8230 + }, + { + "epoch": 1.0470678030784888, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.00581169128418, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8712812662124634, + "num_tokens": 313927920.0, + "step": 8231 + }, + { + "epoch": 1.0471950133570793, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.208236694335938, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.872903048992157, + "num_tokens": 313963230.0, + "step": 8232 + }, + { + "epoch": 1.0473222236356698, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.274442672729492, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8551239371299744, + "num_tokens": 313995951.0, + "step": 8233 + }, + { + "epoch": 1.0474494339142604, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.009702682495117, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8635789155960083, + "num_tokens": 314040928.0, + "step": 8234 + }, + { + "epoch": 1.0475766441928507, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.17888832092285, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8545519113540649, + "num_tokens": 314080329.0, + "step": 8235 + }, + { + "epoch": 1.0477038544714412, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.073915481567383, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8640835285186768, + "num_tokens": 314117107.0, + "step": 8236 + }, + { + "epoch": 1.0478310647500317, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.33858299255371, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8604870438575745, + "num_tokens": 314148077.0, + "step": 8237 + }, + { + "epoch": 1.0479582750286223, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.148847579956055, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8546781539916992, + "num_tokens": 314182064.0, + "step": 8238 + }, + { + "epoch": 1.0480854853072128, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.106107711791992, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.853763997554779, + "num_tokens": 314221667.0, + "step": 8239 + }, + { + "epoch": 1.0482126955858033, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.981752395629883, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8605560660362244, + "num_tokens": 314255225.0, + "step": 8240 + }, + { + "epoch": 1.0483399058643939, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03734016418457, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8641002178192139, + "num_tokens": 314288011.0, + "step": 8241 + }, + { + "epoch": 1.0484671161429844, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.083559036254883, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8608732223510742, + "num_tokens": 314332461.0, + "step": 8242 + }, + { + "epoch": 1.048594326421575, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.882328033447266, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8587408661842346, + "num_tokens": 314367967.0, + "step": 8243 + }, + { + "epoch": 1.0487215367001654, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.993188858032227, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8724786043167114, + "num_tokens": 314404487.0, + "step": 8244 + }, + { + "epoch": 1.048848746978756, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.233142852783203, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.860582709312439, + "num_tokens": 314444731.0, + "step": 8245 + }, + { + "epoch": 1.0489759572573465, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.87721061706543, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.862651526927948, + "num_tokens": 314479720.0, + "step": 8246 + }, + { + "epoch": 1.0491031675359368, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.159698486328125, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.88020920753479, + "num_tokens": 314515613.0, + "step": 8247 + }, + { + "epoch": 1.0492303778145273, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.86914825439453, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8619095087051392, + "num_tokens": 314556519.0, + "step": 8248 + }, + { + "epoch": 1.0493575880931179, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.28036117553711, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8743470907211304, + "num_tokens": 314590803.0, + "step": 8249 + }, + { + "epoch": 1.0494847983717084, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.991580963134766, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8643333315849304, + "num_tokens": 314631347.0, + "step": 8250 + }, + { + "epoch": 1.049612008650299, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.066688537597656, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.862856388092041, + "num_tokens": 314673739.0, + "step": 8251 + }, + { + "epoch": 1.0497392189288894, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.15324592590332, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.863574743270874, + "num_tokens": 314718498.0, + "step": 8252 + }, + { + "epoch": 1.04986642920748, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.88155746459961, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8678921461105347, + "num_tokens": 314758755.0, + "step": 8253 + }, + { + "epoch": 1.0499936394860705, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.123933792114258, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8666790723800659, + "num_tokens": 314793644.0, + "step": 8254 + }, + { + "epoch": 1.050120849764661, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.068185806274414, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8588736057281494, + "num_tokens": 314828045.0, + "step": 8255 + }, + { + "epoch": 1.0502480600432516, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.135896682739258, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8656656742095947, + "num_tokens": 314866088.0, + "step": 8256 + }, + { + "epoch": 1.050375270321842, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.186901092529297, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8550421595573425, + "num_tokens": 314904757.0, + "step": 8257 + }, + { + "epoch": 1.0505024806004326, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.260114669799805, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8552398681640625, + "num_tokens": 314941410.0, + "step": 8258 + }, + { + "epoch": 1.0506296908790231, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.157943725585938, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8716216683387756, + "num_tokens": 314982060.0, + "step": 8259 + }, + { + "epoch": 1.0507569011576134, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.256303787231445, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8543565273284912, + "num_tokens": 315021812.0, + "step": 8260 + }, + { + "epoch": 1.050884111436204, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.113248825073242, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8594379425048828, + "num_tokens": 315059520.0, + "step": 8261 + }, + { + "epoch": 1.0510113217147945, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.281505584716797, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8582903146743774, + "num_tokens": 315098392.0, + "step": 8262 + }, + { + "epoch": 1.051138531993385, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.016576766967773, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8515768051147461, + "num_tokens": 315137375.0, + "step": 8263 + }, + { + "epoch": 1.0512657422719756, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.290056228637695, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8681541681289673, + "num_tokens": 315171308.0, + "step": 8264 + }, + { + "epoch": 1.051392952550566, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23678207397461, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8640590310096741, + "num_tokens": 315212176.0, + "step": 8265 + }, + { + "epoch": 1.0515201628291566, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.996944427490234, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8513292074203491, + "num_tokens": 315251321.0, + "step": 8266 + }, + { + "epoch": 1.0516473731077471, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.201860427856445, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8490972518920898, + "num_tokens": 315288780.0, + "step": 8267 + }, + { + "epoch": 1.0517745833863377, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.038665771484375, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8852476477622986, + "num_tokens": 315325630.0, + "step": 8268 + }, + { + "epoch": 1.0519017936649282, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16594696044922, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8577194809913635, + "num_tokens": 315362887.0, + "step": 8269 + }, + { + "epoch": 1.0520290039435187, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.241687774658203, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8751373291015625, + "num_tokens": 315397552.0, + "step": 8270 + }, + { + "epoch": 1.0521562142221093, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.40675926208496, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.873261570930481, + "num_tokens": 315436785.0, + "step": 8271 + }, + { + "epoch": 1.0522834245006996, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.99883270263672, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8738541007041931, + "num_tokens": 315467849.0, + "step": 8272 + }, + { + "epoch": 1.05241063477929, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.207738876342773, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8719395399093628, + "num_tokens": 315501496.0, + "step": 8273 + }, + { + "epoch": 1.0525378450578806, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.221036911010742, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8711819052696228, + "num_tokens": 315540878.0, + "step": 8274 + }, + { + "epoch": 1.0526650553364711, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.05709457397461, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8772394061088562, + "num_tokens": 315578139.0, + "step": 8275 + }, + { + "epoch": 1.0527922656150617, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.11344337463379, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8648325204849243, + "num_tokens": 315615845.0, + "step": 8276 + }, + { + "epoch": 1.0529194758936522, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.18444061279297, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8559610843658447, + "num_tokens": 315645407.0, + "step": 8277 + }, + { + "epoch": 1.0530466861722427, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96501922607422, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8708829283714294, + "num_tokens": 315687922.0, + "step": 8278 + }, + { + "epoch": 1.0531738964508333, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.172449111938477, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8649581670761108, + "num_tokens": 315723730.0, + "step": 8279 + }, + { + "epoch": 1.0533011067294238, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.318147659301758, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.875542938709259, + "num_tokens": 315765662.0, + "step": 8280 + }, + { + "epoch": 1.0534283170080143, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.06010627746582, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8390341401100159, + "num_tokens": 315808668.0, + "step": 8281 + }, + { + "epoch": 1.0535555272866048, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.166549682617188, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8591936230659485, + "num_tokens": 315844878.0, + "step": 8282 + }, + { + "epoch": 1.0536827375651954, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.348543167114258, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8770367503166199, + "num_tokens": 315879472.0, + "step": 8283 + }, + { + "epoch": 1.0538099478437857, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.085411071777344, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8579686880111694, + "num_tokens": 315915496.0, + "step": 8284 + }, + { + "epoch": 1.0539371581223762, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.220748901367188, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8658391237258911, + "num_tokens": 315954317.0, + "step": 8285 + }, + { + "epoch": 1.0540643684009667, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.039854049682617, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8587863445281982, + "num_tokens": 315995518.0, + "step": 8286 + }, + { + "epoch": 1.0541915786795573, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27182960510254, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8769403696060181, + "num_tokens": 316040475.0, + "step": 8287 + }, + { + "epoch": 1.0543187889581478, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.040971755981445, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8628294467926025, + "num_tokens": 316083003.0, + "step": 8288 + }, + { + "epoch": 1.0544459992367383, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.26548194885254, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8644534945487976, + "num_tokens": 316124687.0, + "step": 8289 + }, + { + "epoch": 1.0545732095153288, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.95635223388672, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8517165184020996, + "num_tokens": 316170052.0, + "step": 8290 + }, + { + "epoch": 1.0547004197939194, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.21851921081543, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8495836853981018, + "num_tokens": 316211941.0, + "step": 8291 + }, + { + "epoch": 1.05482763007251, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.153751373291016, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8666435480117798, + "num_tokens": 316252592.0, + "step": 8292 + }, + { + "epoch": 1.0549548403511004, + "ewc_loss": 0.0291748046875, + "ewc_loss_parallel": 2.9206275939941406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.038921356201172, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.872397780418396, + "num_tokens": 316287250.0, + "step": 8293 + }, + { + "epoch": 1.055082050629691, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.114410400390625, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8674401044845581, + "num_tokens": 316326308.0, + "step": 8294 + }, + { + "epoch": 1.0552092609082815, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03110122680664, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.867556095123291, + "num_tokens": 316364519.0, + "step": 8295 + }, + { + "epoch": 1.0553364711868718, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.212657928466797, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8684911727905273, + "num_tokens": 316401165.0, + "step": 8296 + }, + { + "epoch": 1.0554636814654623, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.117033004760742, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8583188056945801, + "num_tokens": 316440245.0, + "step": 8297 + }, + { + "epoch": 1.0555908917440529, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.0150089263916, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8727520704269409, + "num_tokens": 316484969.0, + "step": 8298 + }, + { + "epoch": 1.0557181020226434, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.052274703979492, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8815978169441223, + "num_tokens": 316520510.0, + "step": 8299 + }, + { + "epoch": 1.055845312301234, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.303457260131836, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8475058674812317, + "num_tokens": 316561946.0, + "step": 8300 + }, + { + "epoch": 1.0559725225798244, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.2216739654541, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.881493091583252, + "num_tokens": 316605489.0, + "step": 8301 + }, + { + "epoch": 1.056099732858415, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.0406551361084, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8474170565605164, + "num_tokens": 316648590.0, + "step": 8302 + }, + { + "epoch": 1.0562269431370055, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.00124168395996, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8771307468414307, + "num_tokens": 316688169.0, + "step": 8303 + }, + { + "epoch": 1.056354153415596, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.890106201171875, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8441938161849976, + "num_tokens": 316724031.0, + "step": 8304 + }, + { + "epoch": 1.0564813636941865, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.155248641967773, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8636652231216431, + "num_tokens": 316769762.0, + "step": 8305 + }, + { + "epoch": 1.056608573972777, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.004737854003906, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8721778988838196, + "num_tokens": 316814156.0, + "step": 8306 + }, + { + "epoch": 1.0567357842513676, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04825210571289, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8733819723129272, + "num_tokens": 316852784.0, + "step": 8307 + }, + { + "epoch": 1.0568629945299581, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.95140266418457, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8574243187904358, + "num_tokens": 316891992.0, + "step": 8308 + }, + { + "epoch": 1.0569902048085484, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.0877742767334, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8787890672683716, + "num_tokens": 316932246.0, + "step": 8309 + }, + { + "epoch": 1.057117415087139, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.031524658203125, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8556945323944092, + "num_tokens": 316972017.0, + "step": 8310 + }, + { + "epoch": 1.0572446253657295, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.96180534362793, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8582907915115356, + "num_tokens": 317008631.0, + "step": 8311 + }, + { + "epoch": 1.05737183564432, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.308589935302734, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8702219724655151, + "num_tokens": 317048616.0, + "step": 8312 + }, + { + "epoch": 1.0574990459229106, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.133283615112305, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8563479781150818, + "num_tokens": 317088675.0, + "step": 8313 + }, + { + "epoch": 1.057626256201501, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.128868103027344, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8842917680740356, + "num_tokens": 317120351.0, + "step": 8314 + }, + { + "epoch": 1.0577534664800916, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.201475143432617, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8578174114227295, + "num_tokens": 317159027.0, + "step": 8315 + }, + { + "epoch": 1.0578806767586821, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.142078399658203, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8635733127593994, + "num_tokens": 317188892.0, + "step": 8316 + }, + { + "epoch": 1.0580078870372727, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23103141784668, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8694427013397217, + "num_tokens": 317229026.0, + "step": 8317 + }, + { + "epoch": 1.0581350973158632, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.278915405273438, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8634538650512695, + "num_tokens": 317272946.0, + "step": 8318 + }, + { + "epoch": 1.0582623075944537, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.174266815185547, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8638800382614136, + "num_tokens": 317314625.0, + "step": 8319 + }, + { + "epoch": 1.058389517873044, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.228605270385742, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8743146061897278, + "num_tokens": 317351450.0, + "step": 8320 + }, + { + "epoch": 1.0585167281516346, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.245790481567383, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8473080396652222, + "num_tokens": 317391519.0, + "step": 8321 + }, + { + "epoch": 1.058643938430225, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.09511947631836, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8569322228431702, + "num_tokens": 317430672.0, + "step": 8322 + }, + { + "epoch": 1.0587711487088156, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.341039657592773, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8658450245857239, + "num_tokens": 317470203.0, + "step": 8323 + }, + { + "epoch": 1.0588983589874061, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.159822463989258, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8659297227859497, + "num_tokens": 317509594.0, + "step": 8324 + }, + { + "epoch": 1.0590255692659967, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.205324172973633, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8620492815971375, + "num_tokens": 317549735.0, + "step": 8325 + }, + { + "epoch": 1.0591527795445872, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.465457916259766, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8612505197525024, + "num_tokens": 317585064.0, + "step": 8326 + }, + { + "epoch": 1.0592799898231777, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.215700149536133, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8791540861129761, + "num_tokens": 317622289.0, + "step": 8327 + }, + { + "epoch": 1.0594072001017683, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.559968948364258, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8526480793952942, + "num_tokens": 317662872.0, + "step": 8328 + }, + { + "epoch": 1.0595344103803588, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.099973678588867, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8648781776428223, + "num_tokens": 317701418.0, + "step": 8329 + }, + { + "epoch": 1.0596616206589493, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 21.958017349243164, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.876686155796051, + "num_tokens": 317741416.0, + "step": 8330 + }, + { + "epoch": 1.0597888309375398, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50312614440918, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8683737516403198, + "num_tokens": 317778623.0, + "step": 8331 + }, + { + "epoch": 1.0599160412161304, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.149784088134766, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8655146956443787, + "num_tokens": 317816577.0, + "step": 8332 + }, + { + "epoch": 1.0600432514947207, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.156747817993164, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8620648384094238, + "num_tokens": 317851532.0, + "step": 8333 + }, + { + "epoch": 1.0601704617733112, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.353729248046875, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8644416332244873, + "num_tokens": 317886789.0, + "step": 8334 + }, + { + "epoch": 1.0602976720519017, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.052701950073242, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8667999505996704, + "num_tokens": 317929511.0, + "step": 8335 + }, + { + "epoch": 1.0604248823304923, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.085330963134766, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8622706532478333, + "num_tokens": 317963767.0, + "step": 8336 + }, + { + "epoch": 1.0605520926090828, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.05007553100586, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8817574977874756, + "num_tokens": 318000458.0, + "step": 8337 + }, + { + "epoch": 1.0606793028876733, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.157188415527344, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8814843893051147, + "num_tokens": 318040394.0, + "step": 8338 + }, + { + "epoch": 1.0608065131662638, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19003677368164, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8597874641418457, + "num_tokens": 318080859.0, + "step": 8339 + }, + { + "epoch": 1.0609337234448544, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04295539855957, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8811041116714478, + "num_tokens": 318115378.0, + "step": 8340 + }, + { + "epoch": 1.061060933723445, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.568523406982422, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8559499979019165, + "num_tokens": 318150351.0, + "step": 8341 + }, + { + "epoch": 1.0611881440020354, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.147449493408203, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8635610342025757, + "num_tokens": 318192427.0, + "step": 8342 + }, + { + "epoch": 1.061315354280626, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.026485443115234, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8516135811805725, + "num_tokens": 318232150.0, + "step": 8343 + }, + { + "epoch": 1.0614425645592165, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.310514450073242, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.86043381690979, + "num_tokens": 318267742.0, + "step": 8344 + }, + { + "epoch": 1.0615697748378068, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27764129638672, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8694554567337036, + "num_tokens": 318307974.0, + "step": 8345 + }, + { + "epoch": 1.0616969851163973, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.224971771240234, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8460209965705872, + "num_tokens": 318349880.0, + "step": 8346 + }, + { + "epoch": 1.0618241953949878, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.392353057861328, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8623524308204651, + "num_tokens": 318383869.0, + "step": 8347 + }, + { + "epoch": 1.0619514056735784, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.38945198059082, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8796531558036804, + "num_tokens": 318419659.0, + "step": 8348 + }, + { + "epoch": 1.062078615952169, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.170734405517578, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8730480670928955, + "num_tokens": 318460795.0, + "step": 8349 + }, + { + "epoch": 1.0622058262307594, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.234132766723633, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8544799089431763, + "num_tokens": 318499451.0, + "step": 8350 + }, + { + "epoch": 1.06233303650935, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.683719635009766, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8693256378173828, + "num_tokens": 318533413.0, + "step": 8351 + }, + { + "epoch": 1.0624602467879405, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.126331329345703, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8540658950805664, + "num_tokens": 318572533.0, + "step": 8352 + }, + { + "epoch": 1.062587457066531, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36709213256836, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8665946125984192, + "num_tokens": 318609442.0, + "step": 8353 + }, + { + "epoch": 1.0627146673451215, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.329561233520508, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8790192604064941, + "num_tokens": 318643475.0, + "step": 8354 + }, + { + "epoch": 1.062841877623712, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.12876319885254, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.868669331073761, + "num_tokens": 318684223.0, + "step": 8355 + }, + { + "epoch": 1.0629690879023026, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.241762161254883, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.866148829460144, + "num_tokens": 318717148.0, + "step": 8356 + }, + { + "epoch": 1.0630962981808931, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.490705490112305, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.875410795211792, + "num_tokens": 318758929.0, + "step": 8357 + }, + { + "epoch": 1.0632235084594834, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.085966110229492, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8620530366897583, + "num_tokens": 318796438.0, + "step": 8358 + }, + { + "epoch": 1.063350718738074, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.142091751098633, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8522542715072632, + "num_tokens": 318835732.0, + "step": 8359 + }, + { + "epoch": 1.0634779290166645, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42276382446289, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8623206615447998, + "num_tokens": 318872151.0, + "step": 8360 + }, + { + "epoch": 1.063605139295255, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.376203536987305, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8530714511871338, + "num_tokens": 318911956.0, + "step": 8361 + }, + { + "epoch": 1.0637323495738455, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.202070236206055, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8659398555755615, + "num_tokens": 318945593.0, + "step": 8362 + }, + { + "epoch": 1.063859559852436, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.161134719848633, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8739825487136841, + "num_tokens": 318976670.0, + "step": 8363 + }, + { + "epoch": 1.0639867701310266, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.407577514648438, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8671914935112, + "num_tokens": 319007798.0, + "step": 8364 + }, + { + "epoch": 1.0641139804096171, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.07303237915039, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.853717565536499, + "num_tokens": 319044808.0, + "step": 8365 + }, + { + "epoch": 1.0642411906882077, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.14686393737793, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8841005563735962, + "num_tokens": 319083387.0, + "step": 8366 + }, + { + "epoch": 1.0643684009667982, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16130256652832, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8797827959060669, + "num_tokens": 319121897.0, + "step": 8367 + }, + { + "epoch": 1.0644956112453887, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.141109466552734, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8526679277420044, + "num_tokens": 319157190.0, + "step": 8368 + }, + { + "epoch": 1.064622821523979, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.516862869262695, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8741322755813599, + "num_tokens": 319188353.0, + "step": 8369 + }, + { + "epoch": 1.0647500318025696, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.05467987060547, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8656147718429565, + "num_tokens": 319225536.0, + "step": 8370 + }, + { + "epoch": 1.06487724208116, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.131994247436523, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.871069610118866, + "num_tokens": 319265114.0, + "step": 8371 + }, + { + "epoch": 1.0650044523597506, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16849708557129, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8752009868621826, + "num_tokens": 319309280.0, + "step": 8372 + }, + { + "epoch": 1.0651316626383411, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.099178314208984, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8759828805923462, + "num_tokens": 319346114.0, + "step": 8373 + }, + { + "epoch": 1.0652588729169317, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.12053108215332, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8727980852127075, + "num_tokens": 319389001.0, + "step": 8374 + }, + { + "epoch": 1.0653860831955222, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.281997680664062, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8715841174125671, + "num_tokens": 319431464.0, + "step": 8375 + }, + { + "epoch": 1.0655132934741127, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.180240631103516, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8410221934318542, + "num_tokens": 319475638.0, + "step": 8376 + }, + { + "epoch": 1.0656405037527032, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.275630950927734, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8743884563446045, + "num_tokens": 319515983.0, + "step": 8377 + }, + { + "epoch": 1.0657677140312938, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.168716430664062, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8607885837554932, + "num_tokens": 319552152.0, + "step": 8378 + }, + { + "epoch": 1.0658949243098843, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37957000732422, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8566058278083801, + "num_tokens": 319592963.0, + "step": 8379 + }, + { + "epoch": 1.0660221345884748, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16532325744629, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8514822721481323, + "num_tokens": 319635136.0, + "step": 8380 + }, + { + "epoch": 1.0661493448670654, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.15045166015625, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8684749603271484, + "num_tokens": 319670118.0, + "step": 8381 + }, + { + "epoch": 1.0662765551456557, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.281522750854492, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8699401617050171, + "num_tokens": 319711057.0, + "step": 8382 + }, + { + "epoch": 1.0664037654242462, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.024404525756836, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8637328147888184, + "num_tokens": 319746451.0, + "step": 8383 + }, + { + "epoch": 1.0665309757028367, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.355426788330078, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.857928991317749, + "num_tokens": 319792149.0, + "step": 8384 + }, + { + "epoch": 1.0666581859814273, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.158193588256836, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8567231893539429, + "num_tokens": 319826508.0, + "step": 8385 + }, + { + "epoch": 1.0667853962600178, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.210813522338867, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8664993047714233, + "num_tokens": 319867000.0, + "step": 8386 + }, + { + "epoch": 1.0669126065386083, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.319683074951172, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8466199636459351, + "num_tokens": 319904476.0, + "step": 8387 + }, + { + "epoch": 1.0670398168171988, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16887092590332, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8415791988372803, + "num_tokens": 319943279.0, + "step": 8388 + }, + { + "epoch": 1.0671670270957894, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.286102294921875, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8612823486328125, + "num_tokens": 319986610.0, + "step": 8389 + }, + { + "epoch": 1.06729423737438, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.21857452392578, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8579869270324707, + "num_tokens": 320022702.0, + "step": 8390 + }, + { + "epoch": 1.0674214476529704, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.174346923828125, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8671735525131226, + "num_tokens": 320067918.0, + "step": 8391 + }, + { + "epoch": 1.067548657931561, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.14364242553711, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8556153178215027, + "num_tokens": 320110455.0, + "step": 8392 + }, + { + "epoch": 1.0676758682101515, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.160776138305664, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8812764883041382, + "num_tokens": 320147675.0, + "step": 8393 + }, + { + "epoch": 1.0678030784887418, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.309581756591797, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8508358001708984, + "num_tokens": 320183882.0, + "step": 8394 + }, + { + "epoch": 1.0679302887673323, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.18769645690918, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8533640503883362, + "num_tokens": 320222215.0, + "step": 8395 + }, + { + "epoch": 1.0680574990459228, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.26934242248535, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.857467770576477, + "num_tokens": 320262418.0, + "step": 8396 + }, + { + "epoch": 1.0681847093245134, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.290237426757812, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8576936721801758, + "num_tokens": 320297304.0, + "step": 8397 + }, + { + "epoch": 1.068311919603104, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.246408462524414, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.866305947303772, + "num_tokens": 320330919.0, + "step": 8398 + }, + { + "epoch": 1.0684391298816944, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39167022705078, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8642457723617554, + "num_tokens": 320367602.0, + "step": 8399 + }, + { + "epoch": 1.068566340160285, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.11775779724121, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8646447062492371, + "num_tokens": 320403875.0, + "step": 8400 + }, + { + "epoch": 1.0686935504388755, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.40203094482422, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8673757314682007, + "num_tokens": 320439149.0, + "step": 8401 + }, + { + "epoch": 1.068820760717466, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.089839935302734, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8759714365005493, + "num_tokens": 320476786.0, + "step": 8402 + }, + { + "epoch": 1.0689479709960565, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.134897232055664, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.848842978477478, + "num_tokens": 320516945.0, + "step": 8403 + }, + { + "epoch": 1.069075181274647, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.18376350402832, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8785354495048523, + "num_tokens": 320552801.0, + "step": 8404 + }, + { + "epoch": 1.0692023915532376, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.278982162475586, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.867875337600708, + "num_tokens": 320593605.0, + "step": 8405 + }, + { + "epoch": 1.0693296018318281, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.254409790039062, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8714264631271362, + "num_tokens": 320624476.0, + "step": 8406 + }, + { + "epoch": 1.0694568121104184, + "ewc_loss": 0.029296875, + "ewc_loss_parallel": 2.9325485229492188e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.255199432373047, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8709688186645508, + "num_tokens": 320667000.0, + "step": 8407 + }, + { + "epoch": 1.069584022389009, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.326021194458008, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8699033260345459, + "num_tokens": 320699286.0, + "step": 8408 + }, + { + "epoch": 1.0697112326675995, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.12066650390625, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8702956438064575, + "num_tokens": 320732520.0, + "step": 8409 + }, + { + "epoch": 1.06983844294619, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.281843185424805, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8535292744636536, + "num_tokens": 320773425.0, + "step": 8410 + }, + { + "epoch": 1.0699656532247805, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.298107147216797, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8424487709999084, + "num_tokens": 320810859.0, + "step": 8411 + }, + { + "epoch": 1.070092863503371, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.278799057006836, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8675380349159241, + "num_tokens": 320850266.0, + "step": 8412 + }, + { + "epoch": 1.0702200737819616, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.228954315185547, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8632731437683105, + "num_tokens": 320892812.0, + "step": 8413 + }, + { + "epoch": 1.0703472840605521, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.363025665283203, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8768909573554993, + "num_tokens": 320929563.0, + "step": 8414 + }, + { + "epoch": 1.0704744943391427, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23056983947754, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.871973991394043, + "num_tokens": 320964010.0, + "step": 8415 + }, + { + "epoch": 1.0706017046177332, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27252769470215, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8615404963493347, + "num_tokens": 321006405.0, + "step": 8416 + }, + { + "epoch": 1.0707289148963237, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.269058227539062, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8591339588165283, + "num_tokens": 321050146.0, + "step": 8417 + }, + { + "epoch": 1.070856125174914, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.14793586730957, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8633954524993896, + "num_tokens": 321086857.0, + "step": 8418 + }, + { + "epoch": 1.0709833354535045, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.54428482055664, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8693234324455261, + "num_tokens": 321127310.0, + "step": 8419 + }, + { + "epoch": 1.071110545732095, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.296157836914062, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8542450666427612, + "num_tokens": 321165881.0, + "step": 8420 + }, + { + "epoch": 1.0712377560106856, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.1955509185791, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8580149412155151, + "num_tokens": 321204494.0, + "step": 8421 + }, + { + "epoch": 1.0713649662892761, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.267717361450195, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8544483184814453, + "num_tokens": 321243639.0, + "step": 8422 + }, + { + "epoch": 1.0714921765678667, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.412302017211914, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8561667203903198, + "num_tokens": 321281399.0, + "step": 8423 + }, + { + "epoch": 1.0716193868464572, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.201501846313477, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.851330041885376, + "num_tokens": 321316539.0, + "step": 8424 + }, + { + "epoch": 1.0717465971250477, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.292617797851562, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8589828014373779, + "num_tokens": 321348618.0, + "step": 8425 + }, + { + "epoch": 1.0718738074036382, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37128448486328, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8699433207511902, + "num_tokens": 321381674.0, + "step": 8426 + }, + { + "epoch": 1.0720010176822288, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.455135345458984, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8807108402252197, + "num_tokens": 321420342.0, + "step": 8427 + }, + { + "epoch": 1.0721282279608193, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.133190155029297, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8619331121444702, + "num_tokens": 321450770.0, + "step": 8428 + }, + { + "epoch": 1.0722554382394098, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.090290069580078, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8670123815536499, + "num_tokens": 321491450.0, + "step": 8429 + }, + { + "epoch": 1.0723826485180004, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.376123428344727, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8591843843460083, + "num_tokens": 321531774.0, + "step": 8430 + }, + { + "epoch": 1.0725098587965907, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.095117568969727, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8642566800117493, + "num_tokens": 321567692.0, + "step": 8431 + }, + { + "epoch": 1.0726370690751812, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.397409439086914, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8587145209312439, + "num_tokens": 321601168.0, + "step": 8432 + }, + { + "epoch": 1.0727642793537717, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.421287536621094, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.846750020980835, + "num_tokens": 321636100.0, + "step": 8433 + }, + { + "epoch": 1.0728914896323622, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.104543685913086, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8654399514198303, + "num_tokens": 321674365.0, + "step": 8434 + }, + { + "epoch": 1.0730186999109528, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.66156005859375, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8743036985397339, + "num_tokens": 321712267.0, + "step": 8435 + }, + { + "epoch": 1.0731459101895433, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.196130752563477, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8629056215286255, + "num_tokens": 321749955.0, + "step": 8436 + }, + { + "epoch": 1.0732731204681338, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.285911560058594, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8718667030334473, + "num_tokens": 321788522.0, + "step": 8437 + }, + { + "epoch": 1.0734003307467244, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37118148803711, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8620330691337585, + "num_tokens": 321826314.0, + "step": 8438 + }, + { + "epoch": 1.073527541025315, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.189889907836914, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8607613444328308, + "num_tokens": 321864851.0, + "step": 8439 + }, + { + "epoch": 1.0736547513039054, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.367244720458984, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8732476234436035, + "num_tokens": 321901001.0, + "step": 8440 + }, + { + "epoch": 1.073781961582496, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.102859497070312, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8624013662338257, + "num_tokens": 321941368.0, + "step": 8441 + }, + { + "epoch": 1.0739091718610865, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25627326965332, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8673229217529297, + "num_tokens": 321978268.0, + "step": 8442 + }, + { + "epoch": 1.0740363821396768, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.180988311767578, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8716458082199097, + "num_tokens": 322016969.0, + "step": 8443 + }, + { + "epoch": 1.0741635924182673, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.325153350830078, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8574361205101013, + "num_tokens": 322059362.0, + "step": 8444 + }, + { + "epoch": 1.0742908026968578, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.391141891479492, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8449308276176453, + "num_tokens": 322097334.0, + "step": 8445 + }, + { + "epoch": 1.0744180129754484, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.256315231323242, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8842458724975586, + "num_tokens": 322121988.0, + "step": 8446 + }, + { + "epoch": 1.074545223254039, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.231386184692383, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8593049645423889, + "num_tokens": 322156602.0, + "step": 8447 + }, + { + "epoch": 1.0746724335326294, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36387825012207, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8610798120498657, + "num_tokens": 322194475.0, + "step": 8448 + }, + { + "epoch": 1.07479964381122, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.222248077392578, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8718301057815552, + "num_tokens": 322232158.0, + "step": 8449 + }, + { + "epoch": 1.0749268540898105, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.451738357543945, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8722495436668396, + "num_tokens": 322272170.0, + "step": 8450 + }, + { + "epoch": 1.075054064368401, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.222936630249023, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8698490858078003, + "num_tokens": 322305496.0, + "step": 8451 + }, + { + "epoch": 1.0751812746469915, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.459272384643555, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8686838746070862, + "num_tokens": 322347552.0, + "step": 8452 + }, + { + "epoch": 1.075308484925582, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.110918045043945, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8662986159324646, + "num_tokens": 322386471.0, + "step": 8453 + }, + { + "epoch": 1.0754356952041726, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.429723739624023, + "learning_rate": 1e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.829123854637146, + "num_tokens": 322431019.0, + "step": 8454 + }, + { + "epoch": 1.0755629054827631, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.202110290527344, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8826866149902344, + "num_tokens": 322471525.0, + "step": 8455 + }, + { + "epoch": 1.0756901157613534, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.398681640625, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8660296201705933, + "num_tokens": 322516594.0, + "step": 8456 + }, + { + "epoch": 1.075817326039944, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.262741088867188, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8703079223632812, + "num_tokens": 322557837.0, + "step": 8457 + }, + { + "epoch": 1.0759445363185345, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.238235473632812, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8596909046173096, + "num_tokens": 322594165.0, + "step": 8458 + }, + { + "epoch": 1.076071746597125, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.249174118041992, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8468970060348511, + "num_tokens": 322637083.0, + "step": 8459 + }, + { + "epoch": 1.0761989568757155, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.167724609375, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8626333475112915, + "num_tokens": 322673216.0, + "step": 8460 + }, + { + "epoch": 1.076326167154306, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37143325805664, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.866690993309021, + "num_tokens": 322709150.0, + "step": 8461 + }, + { + "epoch": 1.0764533774328966, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.175617218017578, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8702382445335388, + "num_tokens": 322747893.0, + "step": 8462 + }, + { + "epoch": 1.0765805877114871, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.126272201538086, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8466264009475708, + "num_tokens": 322781667.0, + "step": 8463 + }, + { + "epoch": 1.0767077979900777, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.298141479492188, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8638801574707031, + "num_tokens": 322818008.0, + "step": 8464 + }, + { + "epoch": 1.0768350082686682, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3261661529541, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8439764380455017, + "num_tokens": 322856528.0, + "step": 8465 + }, + { + "epoch": 1.0769622185472587, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.20517921447754, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8673485517501831, + "num_tokens": 322888798.0, + "step": 8466 + }, + { + "epoch": 1.077089428825849, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.488037109375, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8659266233444214, + "num_tokens": 322923301.0, + "step": 8467 + }, + { + "epoch": 1.0772166391044395, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.287187576293945, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8762459754943848, + "num_tokens": 322964253.0, + "step": 8468 + }, + { + "epoch": 1.07734384938303, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19963836669922, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8618963956832886, + "num_tokens": 323006926.0, + "step": 8469 + }, + { + "epoch": 1.0774710596616206, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.269920349121094, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8773571252822876, + "num_tokens": 323047541.0, + "step": 8470 + }, + { + "epoch": 1.0775982699402111, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.304920196533203, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8571522235870361, + "num_tokens": 323084272.0, + "step": 8471 + }, + { + "epoch": 1.0777254802188017, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.18711280822754, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8758620023727417, + "num_tokens": 323116698.0, + "step": 8472 + }, + { + "epoch": 1.0778526904973922, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.410266876220703, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8723278045654297, + "num_tokens": 323158836.0, + "step": 8473 + }, + { + "epoch": 1.0779799007759827, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.241727828979492, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8603382110595703, + "num_tokens": 323193777.0, + "step": 8474 + }, + { + "epoch": 1.0781071110545732, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.216651916503906, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8651528358459473, + "num_tokens": 323232546.0, + "step": 8475 + }, + { + "epoch": 1.0782343213331638, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.124286651611328, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8636333346366882, + "num_tokens": 323269403.0, + "step": 8476 + }, + { + "epoch": 1.0783615316117543, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3123722076416, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.877639889717102, + "num_tokens": 323305982.0, + "step": 8477 + }, + { + "epoch": 1.0784887418903448, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.169092178344727, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8595207929611206, + "num_tokens": 323341032.0, + "step": 8478 + }, + { + "epoch": 1.0786159521689354, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.2624454498291, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8577696681022644, + "num_tokens": 323381647.0, + "step": 8479 + }, + { + "epoch": 1.0787431624475257, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.170917510986328, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8684492111206055, + "num_tokens": 323417185.0, + "step": 8480 + }, + { + "epoch": 1.0788703727261162, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.287233352661133, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8636520504951477, + "num_tokens": 323454624.0, + "step": 8481 + }, + { + "epoch": 1.0789975830047067, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.129703521728516, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8648111820220947, + "num_tokens": 323492862.0, + "step": 8482 + }, + { + "epoch": 1.0791247932832972, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.155447006225586, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.860598623752594, + "num_tokens": 323536012.0, + "step": 8483 + }, + { + "epoch": 1.0792520035618878, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.222810745239258, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8688942193984985, + "num_tokens": 323573495.0, + "step": 8484 + }, + { + "epoch": 1.0793792138404783, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.150054931640625, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8747471570968628, + "num_tokens": 323610900.0, + "step": 8485 + }, + { + "epoch": 1.0795064241190688, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3797550201416, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8640127182006836, + "num_tokens": 323650983.0, + "step": 8486 + }, + { + "epoch": 1.0796336343976594, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.108022689819336, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8568019866943359, + "num_tokens": 323690155.0, + "step": 8487 + }, + { + "epoch": 1.0797608446762499, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.180192947387695, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8775482177734375, + "num_tokens": 323732212.0, + "step": 8488 + }, + { + "epoch": 1.0798880549548404, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.03070831298828, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8729546070098877, + "num_tokens": 323773400.0, + "step": 8489 + }, + { + "epoch": 1.080015265233431, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.365964889526367, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8715689182281494, + "num_tokens": 323812753.0, + "step": 8490 + }, + { + "epoch": 1.0801424755120215, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.14036750793457, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8665283918380737, + "num_tokens": 323855835.0, + "step": 8491 + }, + { + "epoch": 1.0802696857906118, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3156795501709, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.877049446105957, + "num_tokens": 323896065.0, + "step": 8492 + }, + { + "epoch": 1.0803968960692023, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.16559410095215, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8678945302963257, + "num_tokens": 323934351.0, + "step": 8493 + }, + { + "epoch": 1.0805241063477928, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37752914428711, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.854854166507721, + "num_tokens": 323968878.0, + "step": 8494 + }, + { + "epoch": 1.0806513166263834, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.455657958984375, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8635051250457764, + "num_tokens": 324004838.0, + "step": 8495 + }, + { + "epoch": 1.080778526904974, + "ewc_loss": 0.0294189453125, + "ewc_loss_parallel": 2.944469451904297e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.184432983398438, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.880172848701477, + "num_tokens": 324040851.0, + "step": 8496 + }, + { + "epoch": 1.0809057371835644, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.342065811157227, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8519201278686523, + "num_tokens": 324084675.0, + "step": 8497 + }, + { + "epoch": 1.081032947462155, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.08933448791504, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8701556324958801, + "num_tokens": 324122136.0, + "step": 8498 + }, + { + "epoch": 1.0811601577407455, + "ewc_loss": 0.029541015625, + "ewc_loss_parallel": 2.956390380859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3005428314209, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8632317781448364, + "num_tokens": 324153142.0, + "step": 8499 + }, + { + "epoch": 1.081287368019336, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25117301940918, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8452579379081726, + "num_tokens": 324187244.0, + "step": 8500 + }, + { + "epoch": 1.0814145782979265, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.4384708404541, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8670485019683838, + "num_tokens": 324225644.0, + "step": 8501 + }, + { + "epoch": 1.081541788576517, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.302663803100586, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8727666139602661, + "num_tokens": 324270394.0, + "step": 8502 + }, + { + "epoch": 1.0816689988551076, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.238618850708008, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.875784695148468, + "num_tokens": 324311244.0, + "step": 8503 + }, + { + "epoch": 1.0817962091336981, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36638641357422, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8470979332923889, + "num_tokens": 324350821.0, + "step": 8504 + }, + { + "epoch": 1.0819234194122884, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.43174171447754, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8645675182342529, + "num_tokens": 324388255.0, + "step": 8505 + }, + { + "epoch": 1.082050629690879, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34624481201172, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8402516841888428, + "num_tokens": 324430068.0, + "step": 8506 + }, + { + "epoch": 1.0821778399694695, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19608497619629, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8719426393508911, + "num_tokens": 324465341.0, + "step": 8507 + }, + { + "epoch": 1.08230505024806, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.148956298828125, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8729154467582703, + "num_tokens": 324502199.0, + "step": 8508 + }, + { + "epoch": 1.0824322605266505, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.312795639038086, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8670468926429749, + "num_tokens": 324535898.0, + "step": 8509 + }, + { + "epoch": 1.082559470805241, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27117919921875, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8554879426956177, + "num_tokens": 324574404.0, + "step": 8510 + }, + { + "epoch": 1.0826866810838316, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.245927810668945, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8637474775314331, + "num_tokens": 324611743.0, + "step": 8511 + }, + { + "epoch": 1.0828138913624221, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.351417541503906, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8560694456100464, + "num_tokens": 324653126.0, + "step": 8512 + }, + { + "epoch": 1.0829411016410126, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.243833541870117, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.872210681438446, + "num_tokens": 324692957.0, + "step": 8513 + }, + { + "epoch": 1.0830683119196032, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.28081512451172, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8599033355712891, + "num_tokens": 324737214.0, + "step": 8514 + }, + { + "epoch": 1.0831955221981937, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.467893600463867, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8757404088973999, + "num_tokens": 324773098.0, + "step": 8515 + }, + { + "epoch": 1.083322732476784, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.284175872802734, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8544685244560242, + "num_tokens": 324815846.0, + "step": 8516 + }, + { + "epoch": 1.0834499427553745, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.095046997070312, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8612666726112366, + "num_tokens": 324855912.0, + "step": 8517 + }, + { + "epoch": 1.083577153033965, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.343807220458984, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8581570386886597, + "num_tokens": 324893090.0, + "step": 8518 + }, + { + "epoch": 1.0837043633125556, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.06183624267578, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8638632893562317, + "num_tokens": 324929779.0, + "step": 8519 + }, + { + "epoch": 1.0838315735911461, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.495798110961914, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8700901865959167, + "num_tokens": 324961349.0, + "step": 8520 + }, + { + "epoch": 1.0839587838697367, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.331012725830078, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.864307701587677, + "num_tokens": 324998688.0, + "step": 8521 + }, + { + "epoch": 1.0840859941483272, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.220266342163086, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8571048974990845, + "num_tokens": 325040601.0, + "step": 8522 + }, + { + "epoch": 1.0842132044269177, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.33879280090332, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8694214224815369, + "num_tokens": 325081729.0, + "step": 8523 + }, + { + "epoch": 1.0843404147055082, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.20346450805664, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.854507565498352, + "num_tokens": 325118886.0, + "step": 8524 + }, + { + "epoch": 1.0844676249840988, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.404624938964844, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8820969462394714, + "num_tokens": 325156417.0, + "step": 8525 + }, + { + "epoch": 1.0845948352626893, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.214868545532227, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8768832683563232, + "num_tokens": 325200902.0, + "step": 8526 + }, + { + "epoch": 1.0847220455412798, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.26375389099121, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8658291101455688, + "num_tokens": 325233362.0, + "step": 8527 + }, + { + "epoch": 1.0848492558198704, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.374244689941406, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8631666898727417, + "num_tokens": 325274646.0, + "step": 8528 + }, + { + "epoch": 1.0849764660984607, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.04292869567871, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8807618021965027, + "num_tokens": 325304735.0, + "step": 8529 + }, + { + "epoch": 1.0851036763770512, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.304780960083008, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8680464029312134, + "num_tokens": 325339833.0, + "step": 8530 + }, + { + "epoch": 1.0852308866556417, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.210866928100586, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8530972599983215, + "num_tokens": 325373838.0, + "step": 8531 + }, + { + "epoch": 1.0853580969342322, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.210765838623047, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8866551518440247, + "num_tokens": 325412913.0, + "step": 8532 + }, + { + "epoch": 1.0854853072128228, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35646629333496, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8703105449676514, + "num_tokens": 325451830.0, + "step": 8533 + }, + { + "epoch": 1.0856125174914133, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.264503479003906, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8475147485733032, + "num_tokens": 325487162.0, + "step": 8534 + }, + { + "epoch": 1.0857397277700038, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.30242919921875, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8744547367095947, + "num_tokens": 325525906.0, + "step": 8535 + }, + { + "epoch": 1.0858669380485944, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.239702224731445, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8762457370758057, + "num_tokens": 325559925.0, + "step": 8536 + }, + { + "epoch": 1.0859941483271849, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.235855102539062, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8643788695335388, + "num_tokens": 325599980.0, + "step": 8537 + }, + { + "epoch": 1.0861213586057754, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27338981628418, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8496773838996887, + "num_tokens": 325631745.0, + "step": 8538 + }, + { + "epoch": 1.086248568884366, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.479717254638672, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8614712953567505, + "num_tokens": 325673583.0, + "step": 8539 + }, + { + "epoch": 1.0863757791629565, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36767578125, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8573772311210632, + "num_tokens": 325712995.0, + "step": 8540 + }, + { + "epoch": 1.0865029894415468, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25493621826172, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.871622622013092, + "num_tokens": 325748259.0, + "step": 8541 + }, + { + "epoch": 1.0866301997201373, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.257699966430664, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8702154159545898, + "num_tokens": 325787676.0, + "step": 8542 + }, + { + "epoch": 1.0867574099987278, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.415403366088867, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8400096893310547, + "num_tokens": 325827819.0, + "step": 8543 + }, + { + "epoch": 1.0868846202773184, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19961929321289, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8735393285751343, + "num_tokens": 325866328.0, + "step": 8544 + }, + { + "epoch": 1.0870118305559089, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.242511749267578, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8604453206062317, + "num_tokens": 325906752.0, + "step": 8545 + }, + { + "epoch": 1.0871390408344994, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.109420776367188, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8587024211883545, + "num_tokens": 325943765.0, + "step": 8546 + }, + { + "epoch": 1.08726625111309, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.307910919189453, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.871611475944519, + "num_tokens": 325981471.0, + "step": 8547 + }, + { + "epoch": 1.0873934613916805, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.401582717895508, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8703562021255493, + "num_tokens": 326017253.0, + "step": 8548 + }, + { + "epoch": 1.087520671670271, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.211708068847656, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8663989305496216, + "num_tokens": 326053484.0, + "step": 8549 + }, + { + "epoch": 1.0876478819488615, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.270397186279297, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8705971240997314, + "num_tokens": 326084779.0, + "step": 8550 + }, + { + "epoch": 1.087775092227452, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.123939514160156, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8636215329170227, + "num_tokens": 326120795.0, + "step": 8551 + }, + { + "epoch": 1.0879023025060426, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.28966522216797, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.862966001033783, + "num_tokens": 326155991.0, + "step": 8552 + }, + { + "epoch": 1.0880295127846331, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.445545196533203, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8612304925918579, + "num_tokens": 326195738.0, + "step": 8553 + }, + { + "epoch": 1.0881567230632234, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.042997360229492, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8644629120826721, + "num_tokens": 326237773.0, + "step": 8554 + }, + { + "epoch": 1.088283933341814, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.190526962280273, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.878361701965332, + "num_tokens": 326278589.0, + "step": 8555 + }, + { + "epoch": 1.0884111436204045, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.29607582092285, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8585490584373474, + "num_tokens": 326316862.0, + "step": 8556 + }, + { + "epoch": 1.088538353898995, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23707389831543, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8624944090843201, + "num_tokens": 326353892.0, + "step": 8557 + }, + { + "epoch": 1.0886655641775855, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.32423210144043, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8684839010238647, + "num_tokens": 326389478.0, + "step": 8558 + }, + { + "epoch": 1.088792774456176, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.316987991333008, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8597051501274109, + "num_tokens": 326428072.0, + "step": 8559 + }, + { + "epoch": 1.0889199847347666, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.190752029418945, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.867777407169342, + "num_tokens": 326466374.0, + "step": 8560 + }, + { + "epoch": 1.0890471950133571, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.239648818969727, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8612833023071289, + "num_tokens": 326501081.0, + "step": 8561 + }, + { + "epoch": 1.0891744052919476, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.41265106201172, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8744180202484131, + "num_tokens": 326532539.0, + "step": 8562 + }, + { + "epoch": 1.0893016155705382, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.316556930541992, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8702912330627441, + "num_tokens": 326569481.0, + "step": 8563 + }, + { + "epoch": 1.0894288258491287, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25965690612793, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8629454970359802, + "num_tokens": 326609746.0, + "step": 8564 + }, + { + "epoch": 1.089556036127719, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25153160095215, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8538241386413574, + "num_tokens": 326652379.0, + "step": 8565 + }, + { + "epoch": 1.0896832464063095, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.105575561523438, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8609414100646973, + "num_tokens": 326687617.0, + "step": 8566 + }, + { + "epoch": 1.0898104566849, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.548498153686523, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8573187589645386, + "num_tokens": 326724155.0, + "step": 8567 + }, + { + "epoch": 1.0899376669634906, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.116010665893555, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8631243705749512, + "num_tokens": 326768234.0, + "step": 8568 + }, + { + "epoch": 1.0900648772420811, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.188100814819336, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.869037389755249, + "num_tokens": 326810423.0, + "step": 8569 + }, + { + "epoch": 1.0901920875206716, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.41950035095215, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8632211685180664, + "num_tokens": 326851834.0, + "step": 8570 + }, + { + "epoch": 1.0903192977992622, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.051876068115234, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8533255457878113, + "num_tokens": 326889334.0, + "step": 8571 + }, + { + "epoch": 1.0904465080778527, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34589958190918, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8534587025642395, + "num_tokens": 326931884.0, + "step": 8572 + }, + { + "epoch": 1.0905737183564432, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.647356033325195, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.87298184633255, + "num_tokens": 326969043.0, + "step": 8573 + }, + { + "epoch": 1.0907009286350338, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.315153121948242, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8651458621025085, + "num_tokens": 326998369.0, + "step": 8574 + }, + { + "epoch": 1.0908281389136243, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.470077514648438, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8641392588615417, + "num_tokens": 327036909.0, + "step": 8575 + }, + { + "epoch": 1.0909553491922148, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.432504653930664, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8579038381576538, + "num_tokens": 327076053.0, + "step": 8576 + }, + { + "epoch": 1.0910825594708053, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.321645736694336, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8718655705451965, + "num_tokens": 327112285.0, + "step": 8577 + }, + { + "epoch": 1.0912097697493957, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.411457061767578, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8703973293304443, + "num_tokens": 327151591.0, + "step": 8578 + }, + { + "epoch": 1.0913369800279862, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.598567962646484, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8493250012397766, + "num_tokens": 327193600.0, + "step": 8579 + }, + { + "epoch": 1.0914641903065767, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.190261840820312, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8548712730407715, + "num_tokens": 327230249.0, + "step": 8580 + }, + { + "epoch": 1.0915914005851672, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.147838592529297, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.877960205078125, + "num_tokens": 327270043.0, + "step": 8581 + }, + { + "epoch": 1.0917186108637578, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.313779830932617, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8670774698257446, + "num_tokens": 327309396.0, + "step": 8582 + }, + { + "epoch": 1.0918458211423483, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27194595336914, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8722379207611084, + "num_tokens": 327349497.0, + "step": 8583 + }, + { + "epoch": 1.0919730314209388, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45899772644043, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8559037446975708, + "num_tokens": 327385648.0, + "step": 8584 + }, + { + "epoch": 1.0921002416995294, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.318737030029297, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8655744791030884, + "num_tokens": 327426379.0, + "step": 8585 + }, + { + "epoch": 1.0922274519781199, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.257429122924805, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.868497371673584, + "num_tokens": 327467555.0, + "step": 8586 + }, + { + "epoch": 1.0923546622567104, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.32135772705078, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.860533595085144, + "num_tokens": 327505303.0, + "step": 8587 + }, + { + "epoch": 1.092481872535301, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.299898147583008, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8694633841514587, + "num_tokens": 327547854.0, + "step": 8588 + }, + { + "epoch": 1.0926090828138915, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.466901779174805, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8661305904388428, + "num_tokens": 327584132.0, + "step": 8589 + }, + { + "epoch": 1.0927362930924818, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.230825424194336, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8533352613449097, + "num_tokens": 327629250.0, + "step": 8590 + }, + { + "epoch": 1.0928635033710723, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.062665939331055, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8569902181625366, + "num_tokens": 327666988.0, + "step": 8591 + }, + { + "epoch": 1.0929907136496628, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45075035095215, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8669891953468323, + "num_tokens": 327705955.0, + "step": 8592 + }, + { + "epoch": 1.0931179239282534, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19668960571289, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8571397066116333, + "num_tokens": 327746362.0, + "step": 8593 + }, + { + "epoch": 1.0932451342068439, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.278919219970703, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8632054924964905, + "num_tokens": 327789034.0, + "step": 8594 + }, + { + "epoch": 1.0933723444854344, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.291259765625, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8543851375579834, + "num_tokens": 327824203.0, + "step": 8595 + }, + { + "epoch": 1.093499554764025, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.20461654663086, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8657888770103455, + "num_tokens": 327860340.0, + "step": 8596 + }, + { + "epoch": 1.0936267650426155, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50649070739746, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8569459915161133, + "num_tokens": 327901709.0, + "step": 8597 + }, + { + "epoch": 1.093753975321206, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.241626739501953, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8656913042068481, + "num_tokens": 327940327.0, + "step": 8598 + }, + { + "epoch": 1.0938811855997965, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.336292266845703, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8694565892219543, + "num_tokens": 327977966.0, + "step": 8599 + }, + { + "epoch": 1.094008395878387, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.250673294067383, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8606342077255249, + "num_tokens": 328021143.0, + "step": 8600 + }, + { + "epoch": 1.0941356061569776, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.43515396118164, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8696833848953247, + "num_tokens": 328060792.0, + "step": 8601 + }, + { + "epoch": 1.094262816435568, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.189498901367188, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8649731278419495, + "num_tokens": 328099698.0, + "step": 8602 + }, + { + "epoch": 1.0943900267141584, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.46246337890625, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8681778907775879, + "num_tokens": 328141075.0, + "step": 8603 + }, + { + "epoch": 1.094517236992749, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.06966209411621, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8575506806373596, + "num_tokens": 328178945.0, + "step": 8604 + }, + { + "epoch": 1.0946444472713395, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.414640426635742, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8378657102584839, + "num_tokens": 328214423.0, + "step": 8605 + }, + { + "epoch": 1.09477165754993, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.26877212524414, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8646405935287476, + "num_tokens": 328248640.0, + "step": 8606 + }, + { + "epoch": 1.0948988678285205, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.375328063964844, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8432540893554688, + "num_tokens": 328283016.0, + "step": 8607 + }, + { + "epoch": 1.095026078107111, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.234663009643555, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8681374788284302, + "num_tokens": 328322377.0, + "step": 8608 + }, + { + "epoch": 1.0951532883857016, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.375391006469727, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.865628719329834, + "num_tokens": 328359409.0, + "step": 8609 + }, + { + "epoch": 1.0952804986642921, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.30936050415039, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8689202070236206, + "num_tokens": 328401257.0, + "step": 8610 + }, + { + "epoch": 1.0954077089428826, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.188783645629883, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.867711067199707, + "num_tokens": 328435886.0, + "step": 8611 + }, + { + "epoch": 1.0955349192214732, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.349071502685547, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.853196382522583, + "num_tokens": 328472091.0, + "step": 8612 + }, + { + "epoch": 1.0956621295000637, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.346328735351562, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8566908836364746, + "num_tokens": 328509453.0, + "step": 8613 + }, + { + "epoch": 1.095789339778654, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27138900756836, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8636637926101685, + "num_tokens": 328545590.0, + "step": 8614 + }, + { + "epoch": 1.0959165500572445, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.396169662475586, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8541961908340454, + "num_tokens": 328583093.0, + "step": 8615 + }, + { + "epoch": 1.096043760335835, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25689697265625, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8469277620315552, + "num_tokens": 328625312.0, + "step": 8616 + }, + { + "epoch": 1.0961709706144256, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36908721923828, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8568456768989563, + "num_tokens": 328655392.0, + "step": 8617 + }, + { + "epoch": 1.0962981808930161, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.383712768554688, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8491674661636353, + "num_tokens": 328693686.0, + "step": 8618 + }, + { + "epoch": 1.0964253911716066, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.247467041015625, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8753507137298584, + "num_tokens": 328728051.0, + "step": 8619 + }, + { + "epoch": 1.0965526014501972, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.365345001220703, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8535035848617554, + "num_tokens": 328768091.0, + "step": 8620 + }, + { + "epoch": 1.0966798117287877, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.304454803466797, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8609021902084351, + "num_tokens": 328803851.0, + "step": 8621 + }, + { + "epoch": 1.0968070220073782, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.506256103515625, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.874925971031189, + "num_tokens": 328837429.0, + "step": 8622 + }, + { + "epoch": 1.0969342322859688, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.104820251464844, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.864295482635498, + "num_tokens": 328874264.0, + "step": 8623 + }, + { + "epoch": 1.0970614425645593, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.195524215698242, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8671797513961792, + "num_tokens": 328911627.0, + "step": 8624 + }, + { + "epoch": 1.0971886528431498, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25698471069336, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8571202158927917, + "num_tokens": 328944471.0, + "step": 8625 + }, + { + "epoch": 1.0973158631217403, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.280000686645508, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8711854815483093, + "num_tokens": 328983826.0, + "step": 8626 + }, + { + "epoch": 1.0974430734003306, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.32599639892578, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8642432689666748, + "num_tokens": 329019385.0, + "step": 8627 + }, + { + "epoch": 1.0975702836789212, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.330081939697266, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8513432145118713, + "num_tokens": 329065179.0, + "step": 8628 + }, + { + "epoch": 1.0976974939575117, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34226417541504, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.870762050151825, + "num_tokens": 329101077.0, + "step": 8629 + }, + { + "epoch": 1.0978247042361022, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.1405086517334, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8689039349555969, + "num_tokens": 329137538.0, + "step": 8630 + }, + { + "epoch": 1.0979519145146928, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.330873489379883, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8777888417243958, + "num_tokens": 329171952.0, + "step": 8631 + }, + { + "epoch": 1.0980791247932833, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.201189041137695, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8721731901168823, + "num_tokens": 329214956.0, + "step": 8632 + }, + { + "epoch": 1.0982063350718738, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.18154525756836, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8536036610603333, + "num_tokens": 329254228.0, + "step": 8633 + }, + { + "epoch": 1.0983335453504643, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.243627548217773, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8662124872207642, + "num_tokens": 329288337.0, + "step": 8634 + }, + { + "epoch": 1.0984607556290549, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.434734344482422, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8574846982955933, + "num_tokens": 329329118.0, + "step": 8635 + }, + { + "epoch": 1.0985879659076454, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.386432647705078, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8503588438034058, + "num_tokens": 329366301.0, + "step": 8636 + }, + { + "epoch": 1.098715176186236, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23708152770996, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8528695106506348, + "num_tokens": 329401612.0, + "step": 8637 + }, + { + "epoch": 1.0988423864648265, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50408172607422, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8655049800872803, + "num_tokens": 329440403.0, + "step": 8638 + }, + { + "epoch": 1.0989695967434168, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.19163703918457, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8732355833053589, + "num_tokens": 329478699.0, + "step": 8639 + }, + { + "epoch": 1.0990968070220073, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.505416870117188, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8629342317581177, + "num_tokens": 329517860.0, + "step": 8640 + }, + { + "epoch": 1.0992240173005978, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.301795959472656, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.870328962802887, + "num_tokens": 329555385.0, + "step": 8641 + }, + { + "epoch": 1.0993512275791884, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34973907470703, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.860210657119751, + "num_tokens": 329594775.0, + "step": 8642 + }, + { + "epoch": 1.0994784378577789, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.435258865356445, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.867662250995636, + "num_tokens": 329629872.0, + "step": 8643 + }, + { + "epoch": 1.0996056481363694, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.2440242767334, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8588004112243652, + "num_tokens": 329665263.0, + "step": 8644 + }, + { + "epoch": 1.09973285841496, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.475908279418945, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8537940979003906, + "num_tokens": 329705853.0, + "step": 8645 + }, + { + "epoch": 1.0998600686935505, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35209083557129, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8711643218994141, + "num_tokens": 329741690.0, + "step": 8646 + }, + { + "epoch": 1.099987278972141, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.31309700012207, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8566666841506958, + "num_tokens": 329781548.0, + "step": 8647 + }, + { + "epoch": 1.1001144892507315, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.475540161132812, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8721150159835815, + "num_tokens": 329822304.0, + "step": 8648 + }, + { + "epoch": 1.100241699529322, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.324413299560547, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8652712106704712, + "num_tokens": 329856065.0, + "step": 8649 + }, + { + "epoch": 1.1003689098079126, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.549205780029297, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8478814363479614, + "num_tokens": 329895002.0, + "step": 8650 + }, + { + "epoch": 1.100496120086503, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.367826461791992, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8580631613731384, + "num_tokens": 329938810.0, + "step": 8651 + }, + { + "epoch": 1.1006233303650934, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.446353912353516, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8764824867248535, + "num_tokens": 329976513.0, + "step": 8652 + }, + { + "epoch": 1.100750540643684, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50959014892578, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8716025352478027, + "num_tokens": 330014013.0, + "step": 8653 + }, + { + "epoch": 1.1008777509222745, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.260515213012695, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8596593141555786, + "num_tokens": 330052577.0, + "step": 8654 + }, + { + "epoch": 1.101004961200865, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.442920684814453, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8617786169052124, + "num_tokens": 330091385.0, + "step": 8655 + }, + { + "epoch": 1.1011321714794555, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27088737487793, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8727757930755615, + "num_tokens": 330130285.0, + "step": 8656 + }, + { + "epoch": 1.101259381758046, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.499513626098633, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8506433963775635, + "num_tokens": 330166823.0, + "step": 8657 + }, + { + "epoch": 1.1013865920366366, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.281545639038086, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8656215667724609, + "num_tokens": 330200655.0, + "step": 8658 + }, + { + "epoch": 1.101513802315227, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.397762298583984, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8628379106521606, + "num_tokens": 330243656.0, + "step": 8659 + }, + { + "epoch": 1.1016410125938176, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.48781394958496, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8562238216400146, + "num_tokens": 330287200.0, + "step": 8660 + }, + { + "epoch": 1.1017682228724082, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.196266174316406, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8671990633010864, + "num_tokens": 330324794.0, + "step": 8661 + }, + { + "epoch": 1.1018954331509987, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.46282386779785, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8664150238037109, + "num_tokens": 330365297.0, + "step": 8662 + }, + { + "epoch": 1.102022643429589, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.454856872558594, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8812469244003296, + "num_tokens": 330408046.0, + "step": 8663 + }, + { + "epoch": 1.1021498537081795, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.366214752197266, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8602139353752136, + "num_tokens": 330447815.0, + "step": 8664 + }, + { + "epoch": 1.10227706398677, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42508888244629, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8531600832939148, + "num_tokens": 330487796.0, + "step": 8665 + }, + { + "epoch": 1.1024042742653606, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.466876983642578, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.874617874622345, + "num_tokens": 330531155.0, + "step": 8666 + }, + { + "epoch": 1.1025314845439511, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.195598602294922, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8741159439086914, + "num_tokens": 330568611.0, + "step": 8667 + }, + { + "epoch": 1.1026586948225416, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.403213500976562, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8586200475692749, + "num_tokens": 330608085.0, + "step": 8668 + }, + { + "epoch": 1.1027859051011322, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.353073120117188, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.846474826335907, + "num_tokens": 330644776.0, + "step": 8669 + }, + { + "epoch": 1.1029131153797227, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23786735534668, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8530182838439941, + "num_tokens": 330686401.0, + "step": 8670 + }, + { + "epoch": 1.1030403256583132, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.178239822387695, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8697514533996582, + "num_tokens": 330723115.0, + "step": 8671 + }, + { + "epoch": 1.1031675359369038, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.300500869750977, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8799798488616943, + "num_tokens": 330761170.0, + "step": 8672 + }, + { + "epoch": 1.1032947462154943, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45726203918457, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8647884130477905, + "num_tokens": 330790230.0, + "step": 8673 + }, + { + "epoch": 1.1034219564940848, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.287090301513672, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8752766251564026, + "num_tokens": 330826178.0, + "step": 8674 + }, + { + "epoch": 1.1035491667726753, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39609718322754, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8707404136657715, + "num_tokens": 330866341.0, + "step": 8675 + }, + { + "epoch": 1.1036763770512656, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.33603286743164, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8503803610801697, + "num_tokens": 330903955.0, + "step": 8676 + }, + { + "epoch": 1.1038035873298562, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.456859588623047, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8607136011123657, + "num_tokens": 330943024.0, + "step": 8677 + }, + { + "epoch": 1.1039307976084467, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.375349044799805, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.85955810546875, + "num_tokens": 330981004.0, + "step": 8678 + }, + { + "epoch": 1.1040580078870372, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.484561920166016, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8628354072570801, + "num_tokens": 331021532.0, + "step": 8679 + }, + { + "epoch": 1.1041852181656278, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.437753677368164, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8717036843299866, + "num_tokens": 331058010.0, + "step": 8680 + }, + { + "epoch": 1.1043124284442183, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.457212448120117, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8727512955665588, + "num_tokens": 331090713.0, + "step": 8681 + }, + { + "epoch": 1.1044396387228088, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.136005401611328, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8697175979614258, + "num_tokens": 331127532.0, + "step": 8682 + }, + { + "epoch": 1.1045668490013993, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.358318328857422, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.843837559223175, + "num_tokens": 331172088.0, + "step": 8683 + }, + { + "epoch": 1.1046940592799899, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27718162536621, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8580672740936279, + "num_tokens": 331213419.0, + "step": 8684 + }, + { + "epoch": 1.1048212695585804, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42864990234375, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8637517690658569, + "num_tokens": 331250102.0, + "step": 8685 + }, + { + "epoch": 1.104948479837171, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27542495727539, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8658480644226074, + "num_tokens": 331284597.0, + "step": 8686 + }, + { + "epoch": 1.1050756901157615, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.272428512573242, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8564127683639526, + "num_tokens": 331330688.0, + "step": 8687 + }, + { + "epoch": 1.1052029003943518, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.468273162841797, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8670789003372192, + "num_tokens": 331368777.0, + "step": 8688 + }, + { + "epoch": 1.1053301106729423, + "ewc_loss": 0.0296630859375, + "ewc_loss_parallel": 2.968311309814453e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3627986907959, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.868317723274231, + "num_tokens": 331414369.0, + "step": 8689 + }, + { + "epoch": 1.1054573209515328, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.273174285888672, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.870476245880127, + "num_tokens": 331452788.0, + "step": 8690 + }, + { + "epoch": 1.1055845312301233, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.421735763549805, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8528939485549927, + "num_tokens": 331489575.0, + "step": 8691 + }, + { + "epoch": 1.1057117415087139, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.472126007080078, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8815170526504517, + "num_tokens": 331529129.0, + "step": 8692 + }, + { + "epoch": 1.1058389517873044, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27892303466797, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8599507808685303, + "num_tokens": 331564989.0, + "step": 8693 + }, + { + "epoch": 1.105966162065895, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.60359001159668, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8649716973304749, + "num_tokens": 331602561.0, + "step": 8694 + }, + { + "epoch": 1.1060933723444855, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.24567222595215, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8655076026916504, + "num_tokens": 331646902.0, + "step": 8695 + }, + { + "epoch": 1.106220582623076, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.500356674194336, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8653356432914734, + "num_tokens": 331682347.0, + "step": 8696 + }, + { + "epoch": 1.1063477929016665, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.2711181640625, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8567323684692383, + "num_tokens": 331725813.0, + "step": 8697 + }, + { + "epoch": 1.106475003180257, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.20301628112793, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8431931734085083, + "num_tokens": 331767856.0, + "step": 8698 + }, + { + "epoch": 1.1066022134588476, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35270118713379, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8728203177452087, + "num_tokens": 331804238.0, + "step": 8699 + }, + { + "epoch": 1.106729423737438, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.48513412475586, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.865143358707428, + "num_tokens": 331839963.0, + "step": 8700 + }, + { + "epoch": 1.1068566340160284, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23944854736328, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8685640096664429, + "num_tokens": 331881563.0, + "step": 8701 + }, + { + "epoch": 1.106983844294619, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.396547317504883, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8600261807441711, + "num_tokens": 331925540.0, + "step": 8702 + }, + { + "epoch": 1.1071110545732095, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.370222091674805, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8723238706588745, + "num_tokens": 331963171.0, + "step": 8703 + }, + { + "epoch": 1.1072382648518, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.308582305908203, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8834567070007324, + "num_tokens": 331998310.0, + "step": 8704 + }, + { + "epoch": 1.1073654751303905, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.288145065307617, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8698471188545227, + "num_tokens": 332034181.0, + "step": 8705 + }, + { + "epoch": 1.107492685408981, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.203327178955078, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8739292621612549, + "num_tokens": 332075368.0, + "step": 8706 + }, + { + "epoch": 1.1076198956875716, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.31787109375, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8748806715011597, + "num_tokens": 332113281.0, + "step": 8707 + }, + { + "epoch": 1.107747105966162, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.49001693725586, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8564891815185547, + "num_tokens": 332154206.0, + "step": 8708 + }, + { + "epoch": 1.1078743162447526, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25583839416504, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8496354818344116, + "num_tokens": 332195409.0, + "step": 8709 + }, + { + "epoch": 1.1080015265233432, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.43031120300293, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8728892803192139, + "num_tokens": 332233836.0, + "step": 8710 + }, + { + "epoch": 1.1081287368019337, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.347028732299805, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.86922287940979, + "num_tokens": 332274167.0, + "step": 8711 + }, + { + "epoch": 1.108255947080524, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.15843391418457, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8613441586494446, + "num_tokens": 332315856.0, + "step": 8712 + }, + { + "epoch": 1.1083831573591145, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.353801727294922, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8642336130142212, + "num_tokens": 332366365.0, + "step": 8713 + }, + { + "epoch": 1.108510367637705, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.049257278442383, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8621335625648499, + "num_tokens": 332400411.0, + "step": 8714 + }, + { + "epoch": 1.1086375779162956, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.412342071533203, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8674330115318298, + "num_tokens": 332441959.0, + "step": 8715 + }, + { + "epoch": 1.108764788194886, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.348529815673828, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8508046269416809, + "num_tokens": 332479545.0, + "step": 8716 + }, + { + "epoch": 1.1088919984734766, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.435935974121094, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8749186396598816, + "num_tokens": 332516240.0, + "step": 8717 + }, + { + "epoch": 1.1090192087520672, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.40858268737793, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8695731163024902, + "num_tokens": 332553488.0, + "step": 8718 + }, + { + "epoch": 1.1091464190306577, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.458581924438477, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8558851480484009, + "num_tokens": 332588646.0, + "step": 8719 + }, + { + "epoch": 1.1092736293092482, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50684356689453, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8546609878540039, + "num_tokens": 332629103.0, + "step": 8720 + }, + { + "epoch": 1.1094008395878387, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42578887939453, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8700186014175415, + "num_tokens": 332664820.0, + "step": 8721 + }, + { + "epoch": 1.1095280498664293, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.470638275146484, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8666646480560303, + "num_tokens": 332699772.0, + "step": 8722 + }, + { + "epoch": 1.1096552601450198, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.406558990478516, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8729662895202637, + "num_tokens": 332733478.0, + "step": 8723 + }, + { + "epoch": 1.1097824704236103, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.553104400634766, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8600387573242188, + "num_tokens": 332774733.0, + "step": 8724 + }, + { + "epoch": 1.1099096807022006, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.527921676635742, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8411448001861572, + "num_tokens": 332812970.0, + "step": 8725 + }, + { + "epoch": 1.1100368909807912, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.28419303894043, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8600567579269409, + "num_tokens": 332854974.0, + "step": 8726 + }, + { + "epoch": 1.1101641012593817, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.385692596435547, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8500127792358398, + "num_tokens": 332886462.0, + "step": 8727 + }, + { + "epoch": 1.1102913115379722, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.178720474243164, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8608513474464417, + "num_tokens": 332928614.0, + "step": 8728 + }, + { + "epoch": 1.1104185218165628, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36250877380371, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8649455308914185, + "num_tokens": 332962447.0, + "step": 8729 + }, + { + "epoch": 1.1105457320951533, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.371681213378906, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8548267483711243, + "num_tokens": 332991395.0, + "step": 8730 + }, + { + "epoch": 1.1106729423737438, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34732437133789, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8562253713607788, + "num_tokens": 333027852.0, + "step": 8731 + }, + { + "epoch": 1.1108001526523343, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.372268676757812, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8667478561401367, + "num_tokens": 333063337.0, + "step": 8732 + }, + { + "epoch": 1.1109273629309249, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.393062591552734, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8550591468811035, + "num_tokens": 333094632.0, + "step": 8733 + }, + { + "epoch": 1.1110545732095154, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.30164337158203, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.878511905670166, + "num_tokens": 333134071.0, + "step": 8734 + }, + { + "epoch": 1.111181783488106, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.424230575561523, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8701751828193665, + "num_tokens": 333171288.0, + "step": 8735 + }, + { + "epoch": 1.1113089937666965, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3275203704834, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8711397647857666, + "num_tokens": 333212111.0, + "step": 8736 + }, + { + "epoch": 1.1114362040452868, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37726402282715, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8697826266288757, + "num_tokens": 333247557.0, + "step": 8737 + }, + { + "epoch": 1.1115634143238773, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34145164489746, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8636317253112793, + "num_tokens": 333278705.0, + "step": 8738 + }, + { + "epoch": 1.1116906246024678, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.41657066345215, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.865776538848877, + "num_tokens": 333316493.0, + "step": 8739 + }, + { + "epoch": 1.1118178348810583, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.297035217285156, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8669092059135437, + "num_tokens": 333360384.0, + "step": 8740 + }, + { + "epoch": 1.1119450451596489, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45707893371582, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8652448654174805, + "num_tokens": 333395568.0, + "step": 8741 + }, + { + "epoch": 1.1120722554382394, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.40816879272461, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8690609931945801, + "num_tokens": 333429267.0, + "step": 8742 + }, + { + "epoch": 1.11219946571683, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.234691619873047, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.870539665222168, + "num_tokens": 333469908.0, + "step": 8743 + }, + { + "epoch": 1.1123266759954205, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.499073028564453, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8676137328147888, + "num_tokens": 333507865.0, + "step": 8744 + }, + { + "epoch": 1.112453886274011, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.302284240722656, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8754159212112427, + "num_tokens": 333544011.0, + "step": 8745 + }, + { + "epoch": 1.1125810965526015, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37213897705078, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8659154772758484, + "num_tokens": 333581949.0, + "step": 8746 + }, + { + "epoch": 1.112708306831192, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53758430480957, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8728584051132202, + "num_tokens": 333615994.0, + "step": 8747 + }, + { + "epoch": 1.1128355171097826, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.38167381286621, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8802106380462646, + "num_tokens": 333654454.0, + "step": 8748 + }, + { + "epoch": 1.112962727388373, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.121267318725586, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8570706844329834, + "num_tokens": 333700206.0, + "step": 8749 + }, + { + "epoch": 1.1130899376669634, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.407485961914062, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.86864173412323, + "num_tokens": 333737538.0, + "step": 8750 + }, + { + "epoch": 1.113217147945554, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.47660255432129, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8672736883163452, + "num_tokens": 333771237.0, + "step": 8751 + }, + { + "epoch": 1.1133443582241445, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.374183654785156, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8615761995315552, + "num_tokens": 333806522.0, + "step": 8752 + }, + { + "epoch": 1.113471568502735, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.642919540405273, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8646453619003296, + "num_tokens": 333846656.0, + "step": 8753 + }, + { + "epoch": 1.1135987787813255, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.269113540649414, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8562570810317993, + "num_tokens": 333886294.0, + "step": 8754 + }, + { + "epoch": 1.113725989059916, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.492895126342773, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8616725206375122, + "num_tokens": 333923868.0, + "step": 8755 + }, + { + "epoch": 1.1138531993385066, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.4744930267334, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8684214949607849, + "num_tokens": 333957187.0, + "step": 8756 + }, + { + "epoch": 1.113980409617097, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42917823791504, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8785173892974854, + "num_tokens": 333999061.0, + "step": 8757 + }, + { + "epoch": 1.1141076198956876, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.43354606628418, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8590865135192871, + "num_tokens": 334043595.0, + "step": 8758 + }, + { + "epoch": 1.1142348301742782, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42569923400879, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8686593174934387, + "num_tokens": 334078487.0, + "step": 8759 + }, + { + "epoch": 1.1143620404528687, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.406190872192383, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8733404278755188, + "num_tokens": 334119610.0, + "step": 8760 + }, + { + "epoch": 1.114489250731459, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.448192596435547, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8609781861305237, + "num_tokens": 334160149.0, + "step": 8761 + }, + { + "epoch": 1.1146164610100495, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.428951263427734, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8656301498413086, + "num_tokens": 334198667.0, + "step": 8762 + }, + { + "epoch": 1.11474367128864, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.398540496826172, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8666805028915405, + "num_tokens": 334233838.0, + "step": 8763 + }, + { + "epoch": 1.1148708815672306, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39610481262207, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8571972846984863, + "num_tokens": 334275703.0, + "step": 8764 + }, + { + "epoch": 1.114998091845821, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.33025550842285, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8761776685714722, + "num_tokens": 334311119.0, + "step": 8765 + }, + { + "epoch": 1.1151253021244116, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.329994201660156, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8580548763275146, + "num_tokens": 334348298.0, + "step": 8766 + }, + { + "epoch": 1.1152525124030022, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.390918731689453, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8514537811279297, + "num_tokens": 334386986.0, + "step": 8767 + }, + { + "epoch": 1.1153797226815927, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.501989364624023, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8693394660949707, + "num_tokens": 334423263.0, + "step": 8768 + }, + { + "epoch": 1.1155069329601832, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.180932998657227, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8814382553100586, + "num_tokens": 334460363.0, + "step": 8769 + }, + { + "epoch": 1.1156341432387737, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.535343170166016, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8688111305236816, + "num_tokens": 334501883.0, + "step": 8770 + }, + { + "epoch": 1.1157613535173643, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.52741813659668, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8645564317703247, + "num_tokens": 334536242.0, + "step": 8771 + }, + { + "epoch": 1.1158885637959548, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.165054321289062, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8774710297584534, + "num_tokens": 334569685.0, + "step": 8772 + }, + { + "epoch": 1.1160157740745453, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.531478881835938, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.867859423160553, + "num_tokens": 334604946.0, + "step": 8773 + }, + { + "epoch": 1.1161429843531356, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35462760925293, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8708498477935791, + "num_tokens": 334645495.0, + "step": 8774 + }, + { + "epoch": 1.1162701946317262, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.4427433013916, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8598591089248657, + "num_tokens": 334684239.0, + "step": 8775 + }, + { + "epoch": 1.1163974049103167, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39169692993164, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.870548665523529, + "num_tokens": 334713799.0, + "step": 8776 + }, + { + "epoch": 1.1165246151889072, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.22832679748535, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8849846720695496, + "num_tokens": 334753374.0, + "step": 8777 + }, + { + "epoch": 1.1166518254674977, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.640945434570312, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8659053444862366, + "num_tokens": 334792867.0, + "step": 8778 + }, + { + "epoch": 1.1167790357460883, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.371784210205078, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8625036478042603, + "num_tokens": 334832664.0, + "step": 8779 + }, + { + "epoch": 1.1169062460246788, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.253040313720703, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8585538864135742, + "num_tokens": 334876763.0, + "step": 8780 + }, + { + "epoch": 1.1170334563032693, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.317798614501953, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8662147521972656, + "num_tokens": 334909431.0, + "step": 8781 + }, + { + "epoch": 1.1171606665818599, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27484130859375, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.86432945728302, + "num_tokens": 334941939.0, + "step": 8782 + }, + { + "epoch": 1.1172878768604504, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.33426856994629, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8739719986915588, + "num_tokens": 334981338.0, + "step": 8783 + }, + { + "epoch": 1.117415087139041, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37681770324707, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8642868995666504, + "num_tokens": 335020660.0, + "step": 8784 + }, + { + "epoch": 1.1175422974176314, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.278099060058594, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8677415251731873, + "num_tokens": 335057377.0, + "step": 8785 + }, + { + "epoch": 1.1176695076962218, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.395034790039062, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8549016714096069, + "num_tokens": 335092912.0, + "step": 8786 + }, + { + "epoch": 1.1177967179748123, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.570547103881836, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8796617388725281, + "num_tokens": 335129157.0, + "step": 8787 + }, + { + "epoch": 1.1179239282534028, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.317272186279297, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8665386438369751, + "num_tokens": 335168065.0, + "step": 8788 + }, + { + "epoch": 1.1180511385319933, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.49786949157715, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.863715648651123, + "num_tokens": 335218876.0, + "step": 8789 + }, + { + "epoch": 1.1181783488105839, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.336278915405273, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8745191693305969, + "num_tokens": 335256300.0, + "step": 8790 + }, + { + "epoch": 1.1183055590891744, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.426103591918945, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8742672801017761, + "num_tokens": 335291035.0, + "step": 8791 + }, + { + "epoch": 1.118432769367765, + "ewc_loss": 0.0299072265625, + "ewc_loss_parallel": 2.9921531677246094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6397647857666, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8769450187683105, + "num_tokens": 335330092.0, + "step": 8792 + }, + { + "epoch": 1.1185599796463555, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.323659896850586, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.852397084236145, + "num_tokens": 335367146.0, + "step": 8793 + }, + { + "epoch": 1.118687189924946, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.602113723754883, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8656084537506104, + "num_tokens": 335405069.0, + "step": 8794 + }, + { + "epoch": 1.1188144002035365, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.420429229736328, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8551893830299377, + "num_tokens": 335443779.0, + "step": 8795 + }, + { + "epoch": 1.118941610482127, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.57886505126953, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8579416275024414, + "num_tokens": 335485978.0, + "step": 8796 + }, + { + "epoch": 1.1190688207607176, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.240894317626953, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8700016736984253, + "num_tokens": 335530375.0, + "step": 8797 + }, + { + "epoch": 1.119196031039308, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.752262115478516, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8703134059906006, + "num_tokens": 335569109.0, + "step": 8798 + }, + { + "epoch": 1.1193232413178984, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36637306213379, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8516921997070312, + "num_tokens": 335599283.0, + "step": 8799 + }, + { + "epoch": 1.119450451596489, + "ewc_loss": 0.02978515625, + "ewc_loss_parallel": 2.9802322387695312e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.315275192260742, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8721756935119629, + "num_tokens": 335634792.0, + "step": 8800 + }, + { + "epoch": 1.1195776618750795, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.479055404663086, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8788620233535767, + "num_tokens": 335672869.0, + "step": 8801 + }, + { + "epoch": 1.11970487215367, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.404531478881836, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8706933259963989, + "num_tokens": 335714996.0, + "step": 8802 + }, + { + "epoch": 1.1198320824322605, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.448211669921875, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.857276439666748, + "num_tokens": 335756884.0, + "step": 8803 + }, + { + "epoch": 1.119959292710851, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.360326766967773, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8747535347938538, + "num_tokens": 335793935.0, + "step": 8804 + }, + { + "epoch": 1.1200865029894416, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.590923309326172, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8768049478530884, + "num_tokens": 335834169.0, + "step": 8805 + }, + { + "epoch": 1.120213713268032, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.459794998168945, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8831552267074585, + "num_tokens": 335872302.0, + "step": 8806 + }, + { + "epoch": 1.1203409235466226, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.41913414001465, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.866070568561554, + "num_tokens": 335908074.0, + "step": 8807 + }, + { + "epoch": 1.1204681338252132, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.43032455444336, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8759385943412781, + "num_tokens": 335946268.0, + "step": 8808 + }, + { + "epoch": 1.1205953441038037, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.424205780029297, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8662974238395691, + "num_tokens": 335985364.0, + "step": 8809 + }, + { + "epoch": 1.120722554382394, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.551353454589844, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8688846230506897, + "num_tokens": 336025300.0, + "step": 8810 + }, + { + "epoch": 1.1208497646609845, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.461400985717773, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.87037593126297, + "num_tokens": 336062105.0, + "step": 8811 + }, + { + "epoch": 1.120976974939575, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.373464584350586, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8541097640991211, + "num_tokens": 336102944.0, + "step": 8812 + }, + { + "epoch": 1.1211041852181656, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.22584342956543, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8916789889335632, + "num_tokens": 336142964.0, + "step": 8813 + }, + { + "epoch": 1.121231395496756, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.40534782409668, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8679420351982117, + "num_tokens": 336186962.0, + "step": 8814 + }, + { + "epoch": 1.1213586057753466, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36538314819336, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.874483048915863, + "num_tokens": 336220775.0, + "step": 8815 + }, + { + "epoch": 1.1214858160539372, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.304447174072266, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8628944158554077, + "num_tokens": 336263067.0, + "step": 8816 + }, + { + "epoch": 1.1216130263325277, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.351415634155273, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8693050146102905, + "num_tokens": 336308463.0, + "step": 8817 + }, + { + "epoch": 1.1217402366111182, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.257911682128906, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8789408802986145, + "num_tokens": 336346965.0, + "step": 8818 + }, + { + "epoch": 1.1218674468897087, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.517881393432617, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8674716949462891, + "num_tokens": 336382114.0, + "step": 8819 + }, + { + "epoch": 1.1219946571682993, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.223176956176758, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8707138299942017, + "num_tokens": 336414020.0, + "step": 8820 + }, + { + "epoch": 1.1221218674468898, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.407405853271484, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8640814423561096, + "num_tokens": 336452522.0, + "step": 8821 + }, + { + "epoch": 1.1222490777254803, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.389507293701172, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8769530057907104, + "num_tokens": 336484593.0, + "step": 8822 + }, + { + "epoch": 1.1223762880040706, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.25394630432129, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8501169681549072, + "num_tokens": 336518964.0, + "step": 8823 + }, + { + "epoch": 1.1225034982826612, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.301973342895508, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8541303873062134, + "num_tokens": 336556320.0, + "step": 8824 + }, + { + "epoch": 1.1226307085612517, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.404993057250977, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8562671542167664, + "num_tokens": 336595473.0, + "step": 8825 + }, + { + "epoch": 1.1227579188398422, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.234119415283203, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8648268580436707, + "num_tokens": 336631793.0, + "step": 8826 + }, + { + "epoch": 1.1228851291184327, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.208404541015625, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.851579487323761, + "num_tokens": 336671230.0, + "step": 8827 + }, + { + "epoch": 1.1230123393970233, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.450927734375, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.867913544178009, + "num_tokens": 336712015.0, + "step": 8828 + }, + { + "epoch": 1.1231395496756138, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.310171127319336, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.850167453289032, + "num_tokens": 336741206.0, + "step": 8829 + }, + { + "epoch": 1.1232667599542043, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35468864440918, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8584170341491699, + "num_tokens": 336778838.0, + "step": 8830 + }, + { + "epoch": 1.1233939702327949, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.43649673461914, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8717364072799683, + "num_tokens": 336815471.0, + "step": 8831 + }, + { + "epoch": 1.1235211805113854, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.301685333251953, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8680312633514404, + "num_tokens": 336850870.0, + "step": 8832 + }, + { + "epoch": 1.123648390789976, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.15508270263672, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8700523376464844, + "num_tokens": 336891779.0, + "step": 8833 + }, + { + "epoch": 1.1237756010685664, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.342458724975586, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8764233589172363, + "num_tokens": 336931171.0, + "step": 8834 + }, + { + "epoch": 1.1239028113471567, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.424341201782227, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8764154314994812, + "num_tokens": 336963080.0, + "step": 8835 + }, + { + "epoch": 1.1240300216257473, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.267393112182617, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8516950607299805, + "num_tokens": 337000817.0, + "step": 8836 + }, + { + "epoch": 1.1241572319043378, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.440876007080078, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8591747283935547, + "num_tokens": 337037249.0, + "step": 8837 + }, + { + "epoch": 1.1242844421829283, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39261817932129, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8740913271903992, + "num_tokens": 337077752.0, + "step": 8838 + }, + { + "epoch": 1.1244116524615189, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.27145767211914, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8633884191513062, + "num_tokens": 337119649.0, + "step": 8839 + }, + { + "epoch": 1.1245388627401094, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.2919864654541, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8500233292579651, + "num_tokens": 337154376.0, + "step": 8840 + }, + { + "epoch": 1.1246660730187, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.370820999145508, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8753810524940491, + "num_tokens": 337193377.0, + "step": 8841 + }, + { + "epoch": 1.1247932832972904, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.343114852905273, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8564252853393555, + "num_tokens": 337233176.0, + "step": 8842 + }, + { + "epoch": 1.124920493575881, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.380298614501953, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8521119356155396, + "num_tokens": 337262995.0, + "step": 8843 + }, + { + "epoch": 1.1250477038544715, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.314741134643555, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8639245629310608, + "num_tokens": 337302466.0, + "step": 8844 + }, + { + "epoch": 1.125174914133062, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.632875442504883, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8774080872535706, + "num_tokens": 337341197.0, + "step": 8845 + }, + { + "epoch": 1.1253021244116526, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.330272674560547, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8671712875366211, + "num_tokens": 337375602.0, + "step": 8846 + }, + { + "epoch": 1.125429334690243, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.368661880493164, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8562800884246826, + "num_tokens": 337406685.0, + "step": 8847 + }, + { + "epoch": 1.1255565449688334, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.325883865356445, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8598958253860474, + "num_tokens": 337447364.0, + "step": 8848 + }, + { + "epoch": 1.125683755247424, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.522274017333984, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.866234302520752, + "num_tokens": 337487840.0, + "step": 8849 + }, + { + "epoch": 1.1258109655260145, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.557397842407227, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8693009614944458, + "num_tokens": 337525117.0, + "step": 8850 + }, + { + "epoch": 1.125938175804605, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.421140670776367, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8588320016860962, + "num_tokens": 337567024.0, + "step": 8851 + }, + { + "epoch": 1.1260653860831955, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.402631759643555, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8628567457199097, + "num_tokens": 337600117.0, + "step": 8852 + }, + { + "epoch": 1.126192596361786, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.52094841003418, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.850281298160553, + "num_tokens": 337637188.0, + "step": 8853 + }, + { + "epoch": 1.1263198066403766, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.361188888549805, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8602419495582581, + "num_tokens": 337676492.0, + "step": 8854 + }, + { + "epoch": 1.126447016918967, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.536643981933594, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8783676624298096, + "num_tokens": 337713468.0, + "step": 8855 + }, + { + "epoch": 1.1265742271975576, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.364242553710938, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8518978953361511, + "num_tokens": 337750944.0, + "step": 8856 + }, + { + "epoch": 1.1267014374761481, + "ewc_loss": 0.030029296875, + "ewc_loss_parallel": 3.0040740966796875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.49508285522461, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8792044520378113, + "num_tokens": 337793802.0, + "step": 8857 + }, + { + "epoch": 1.1268286477547387, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.541292190551758, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8784409761428833, + "num_tokens": 337828550.0, + "step": 8858 + }, + { + "epoch": 1.126955858033329, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.277099609375, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8755085468292236, + "num_tokens": 337862168.0, + "step": 8859 + }, + { + "epoch": 1.1270830683119195, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.456348419189453, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8614294528961182, + "num_tokens": 337898837.0, + "step": 8860 + }, + { + "epoch": 1.12721027859051, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.479658126831055, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8519057035446167, + "num_tokens": 337939608.0, + "step": 8861 + }, + { + "epoch": 1.1273374888691006, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.400978088378906, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.86480712890625, + "num_tokens": 337983412.0, + "step": 8862 + }, + { + "epoch": 1.127464699147691, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.379396438598633, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8758065700531006, + "num_tokens": 338022171.0, + "step": 8863 + }, + { + "epoch": 1.1275919094262816, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.362754821777344, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8629639148712158, + "num_tokens": 338058838.0, + "step": 8864 + }, + { + "epoch": 1.1277191197048722, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.51552391052246, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8615509271621704, + "num_tokens": 338095080.0, + "step": 8865 + }, + { + "epoch": 1.1278463299834627, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.251811981201172, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8475106954574585, + "num_tokens": 338134206.0, + "step": 8866 + }, + { + "epoch": 1.1279735402620532, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.28120994567871, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8714761734008789, + "num_tokens": 338173794.0, + "step": 8867 + }, + { + "epoch": 1.1281007505406437, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.387311935424805, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8630576729774475, + "num_tokens": 338206591.0, + "step": 8868 + }, + { + "epoch": 1.1282279608192343, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.36151695251465, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8778547048568726, + "num_tokens": 338238728.0, + "step": 8869 + }, + { + "epoch": 1.1283551710978248, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.31443214416504, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8747555613517761, + "num_tokens": 338282372.0, + "step": 8870 + }, + { + "epoch": 1.1284823813764153, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42855453491211, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.859992265701294, + "num_tokens": 338319175.0, + "step": 8871 + }, + { + "epoch": 1.1286095916550056, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.534578323364258, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8622372150421143, + "num_tokens": 338357967.0, + "step": 8872 + }, + { + "epoch": 1.1287368019335962, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.222423553466797, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8722177743911743, + "num_tokens": 338391431.0, + "step": 8873 + }, + { + "epoch": 1.1288640122121867, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.48004913330078, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.873002290725708, + "num_tokens": 338426298.0, + "step": 8874 + }, + { + "epoch": 1.1289912224907772, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35157585144043, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.848429262638092, + "num_tokens": 338468147.0, + "step": 8875 + }, + { + "epoch": 1.1291184327693677, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.487838745117188, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8488826751708984, + "num_tokens": 338517095.0, + "step": 8876 + }, + { + "epoch": 1.1292456430479583, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.33306884765625, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8721430897712708, + "num_tokens": 338549925.0, + "step": 8877 + }, + { + "epoch": 1.1293728533265488, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3707332611084, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8689945936203003, + "num_tokens": 338583966.0, + "step": 8878 + }, + { + "epoch": 1.1295000636051393, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3602237701416, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8702425956726074, + "num_tokens": 338622715.0, + "step": 8879 + }, + { + "epoch": 1.1296272738837299, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.375322341918945, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8527549505233765, + "num_tokens": 338659920.0, + "step": 8880 + }, + { + "epoch": 1.1297544841623204, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53887939453125, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8611779808998108, + "num_tokens": 338698183.0, + "step": 8881 + }, + { + "epoch": 1.129881694440911, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.344148635864258, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8545476794242859, + "num_tokens": 338735853.0, + "step": 8882 + }, + { + "epoch": 1.1300089047195012, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.570249557495117, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8558516502380371, + "num_tokens": 338771714.0, + "step": 8883 + }, + { + "epoch": 1.1301361149980917, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.367382049560547, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8829558491706848, + "num_tokens": 338813823.0, + "step": 8884 + }, + { + "epoch": 1.1302633252766823, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53120231628418, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8685062527656555, + "num_tokens": 338858137.0, + "step": 8885 + }, + { + "epoch": 1.1303905355552728, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.399524688720703, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8551092147827148, + "num_tokens": 338893653.0, + "step": 8886 + }, + { + "epoch": 1.1305177458338633, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.556243896484375, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8468482494354248, + "num_tokens": 338931230.0, + "step": 8887 + }, + { + "epoch": 1.1306449561124539, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.5606689453125, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8638750910758972, + "num_tokens": 338966921.0, + "step": 8888 + }, + { + "epoch": 1.1307721663910444, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.268510818481445, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8454917669296265, + "num_tokens": 339000957.0, + "step": 8889 + }, + { + "epoch": 1.130899376669635, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.605792999267578, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8575973510742188, + "num_tokens": 339035876.0, + "step": 8890 + }, + { + "epoch": 1.1310265869482254, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.454504013061523, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8661783933639526, + "num_tokens": 339073700.0, + "step": 8891 + }, + { + "epoch": 1.131153797226816, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.31309700012207, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8692438006401062, + "num_tokens": 339108222.0, + "step": 8892 + }, + { + "epoch": 1.1312810075054065, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.62135124206543, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8767201900482178, + "num_tokens": 339145749.0, + "step": 8893 + }, + { + "epoch": 1.131408217783997, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.437143325805664, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8515286445617676, + "num_tokens": 339183852.0, + "step": 8894 + }, + { + "epoch": 1.1315354280625876, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.51595687866211, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8800018429756165, + "num_tokens": 339218766.0, + "step": 8895 + }, + { + "epoch": 1.131662638341178, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.537771224975586, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8665997385978699, + "num_tokens": 339259932.0, + "step": 8896 + }, + { + "epoch": 1.1317898486197684, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.551563262939453, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8686395883560181, + "num_tokens": 339302201.0, + "step": 8897 + }, + { + "epoch": 1.131917058898359, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.482269287109375, + "learning_rate": 1e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8361607789993286, + "num_tokens": 339342091.0, + "step": 8898 + }, + { + "epoch": 1.1320442691769494, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.4300594329834, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8601599931716919, + "num_tokens": 339379951.0, + "step": 8899 + }, + { + "epoch": 1.13217147945554, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.427207946777344, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8602651357650757, + "num_tokens": 339422243.0, + "step": 8900 + }, + { + "epoch": 1.1322986897341305, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.71950340270996, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8790199756622314, + "num_tokens": 339464969.0, + "step": 8901 + }, + { + "epoch": 1.132425900012721, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.452131271362305, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8688647747039795, + "num_tokens": 339502382.0, + "step": 8902 + }, + { + "epoch": 1.1325531102913116, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.276268005371094, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8496391773223877, + "num_tokens": 339542875.0, + "step": 8903 + }, + { + "epoch": 1.132680320569902, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.560665130615234, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8526595234870911, + "num_tokens": 339580776.0, + "step": 8904 + }, + { + "epoch": 1.1328075308484926, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.460344314575195, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8786826133728027, + "num_tokens": 339620195.0, + "step": 8905 + }, + { + "epoch": 1.1329347411270831, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.634977340698242, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8731064796447754, + "num_tokens": 339659234.0, + "step": 8906 + }, + { + "epoch": 1.1330619514056737, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.23809814453125, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8553286194801331, + "num_tokens": 339703259.0, + "step": 8907 + }, + { + "epoch": 1.133189161684264, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.427701950073242, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8715655207633972, + "num_tokens": 339745901.0, + "step": 8908 + }, + { + "epoch": 1.1333163719628545, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.47945213317871, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8566630482673645, + "num_tokens": 339790417.0, + "step": 8909 + }, + { + "epoch": 1.133443582241445, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.64539337158203, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8568363189697266, + "num_tokens": 339827224.0, + "step": 8910 + }, + { + "epoch": 1.1335707925200356, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78306007385254, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8593440055847168, + "num_tokens": 339859932.0, + "step": 8911 + }, + { + "epoch": 1.133698002798626, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.274303436279297, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8528441190719604, + "num_tokens": 339902416.0, + "step": 8912 + }, + { + "epoch": 1.1338252130772166, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.514455795288086, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8734118938446045, + "num_tokens": 339940495.0, + "step": 8913 + }, + { + "epoch": 1.1339524233558071, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.620128631591797, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8468527793884277, + "num_tokens": 339983010.0, + "step": 8914 + }, + { + "epoch": 1.1340796336343977, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.268062591552734, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8649637699127197, + "num_tokens": 340020683.0, + "step": 8915 + }, + { + "epoch": 1.1342068439129882, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.4945125579834, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8564100861549377, + "num_tokens": 340063179.0, + "step": 8916 + }, + { + "epoch": 1.1343340541915787, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.521968841552734, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8746155500411987, + "num_tokens": 340100166.0, + "step": 8917 + }, + { + "epoch": 1.1344612644701693, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.376325607299805, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8619564771652222, + "num_tokens": 340144325.0, + "step": 8918 + }, + { + "epoch": 1.1345884747487598, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.635913848876953, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8642419576644897, + "num_tokens": 340180563.0, + "step": 8919 + }, + { + "epoch": 1.1347156850273503, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.403013229370117, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8516169786453247, + "num_tokens": 340224621.0, + "step": 8920 + }, + { + "epoch": 1.1348428953059406, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.624570846557617, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8696312308311462, + "num_tokens": 340264810.0, + "step": 8921 + }, + { + "epoch": 1.1349701055845312, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.444360733032227, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8777107000350952, + "num_tokens": 340301158.0, + "step": 8922 + }, + { + "epoch": 1.1350973158631217, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6325740814209, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.864059567451477, + "num_tokens": 340339806.0, + "step": 8923 + }, + { + "epoch": 1.1352245261417122, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.547773361206055, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8701825141906738, + "num_tokens": 340386412.0, + "step": 8924 + }, + { + "epoch": 1.1353517364203027, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.471649169921875, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8747573494911194, + "num_tokens": 340431391.0, + "step": 8925 + }, + { + "epoch": 1.1354789466988933, + "ewc_loss": 0.0301513671875, + "ewc_loss_parallel": 3.0159950256347656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.610252380371094, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8687570691108704, + "num_tokens": 340465904.0, + "step": 8926 + }, + { + "epoch": 1.1356061569774838, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.611631393432617, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8682730197906494, + "num_tokens": 340504290.0, + "step": 8927 + }, + { + "epoch": 1.1357333672560743, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.51219367980957, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8601377010345459, + "num_tokens": 340539100.0, + "step": 8928 + }, + { + "epoch": 1.1358605775346649, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.484142303466797, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8792867064476013, + "num_tokens": 340569005.0, + "step": 8929 + }, + { + "epoch": 1.1359877878132554, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.51390266418457, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8646716475486755, + "num_tokens": 340613476.0, + "step": 8930 + }, + { + "epoch": 1.136114998091846, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.507240295410156, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8576395511627197, + "num_tokens": 340647982.0, + "step": 8931 + }, + { + "epoch": 1.1362422083704362, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.408571243286133, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8426569700241089, + "num_tokens": 340683270.0, + "step": 8932 + }, + { + "epoch": 1.1363694186490267, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.461545944213867, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8682540655136108, + "num_tokens": 340719955.0, + "step": 8933 + }, + { + "epoch": 1.1364966289276173, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.660079956054688, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8656994104385376, + "num_tokens": 340753934.0, + "step": 8934 + }, + { + "epoch": 1.1366238392062078, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.257915496826172, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8710471391677856, + "num_tokens": 340787545.0, + "step": 8935 + }, + { + "epoch": 1.1367510494847983, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.452112197875977, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8745402097702026, + "num_tokens": 340826362.0, + "step": 8936 + }, + { + "epoch": 1.1368782597633889, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.47539710998535, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8700602054595947, + "num_tokens": 340857871.0, + "step": 8937 + }, + { + "epoch": 1.1370054700419794, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.443513870239258, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8714281320571899, + "num_tokens": 340897736.0, + "step": 8938 + }, + { + "epoch": 1.13713268032057, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.537044525146484, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8607340455055237, + "num_tokens": 340935774.0, + "step": 8939 + }, + { + "epoch": 1.1372598905991604, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.634963989257812, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8684256672859192, + "num_tokens": 340971497.0, + "step": 8940 + }, + { + "epoch": 1.137387100877751, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.273448944091797, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8666086196899414, + "num_tokens": 341016209.0, + "step": 8941 + }, + { + "epoch": 1.1375143111563415, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.591676712036133, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8597700595855713, + "num_tokens": 341052137.0, + "step": 8942 + }, + { + "epoch": 1.137641521434932, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.631614685058594, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8674536347389221, + "num_tokens": 341083417.0, + "step": 8943 + }, + { + "epoch": 1.1377687317135226, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.43115997314453, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8683489561080933, + "num_tokens": 341127798.0, + "step": 8944 + }, + { + "epoch": 1.137895941992113, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.738845825195312, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8516493439674377, + "num_tokens": 341173485.0, + "step": 8945 + }, + { + "epoch": 1.1380231522707034, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.637845993041992, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8543338775634766, + "num_tokens": 341212834.0, + "step": 8946 + }, + { + "epoch": 1.138150362549294, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.495384216308594, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8582582473754883, + "num_tokens": 341246497.0, + "step": 8947 + }, + { + "epoch": 1.1382775728278844, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.703536987304688, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8494638204574585, + "num_tokens": 341291498.0, + "step": 8948 + }, + { + "epoch": 1.138404783106475, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.399837493896484, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.859602153301239, + "num_tokens": 341328104.0, + "step": 8949 + }, + { + "epoch": 1.1385319933850655, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.457916259765625, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8758953809738159, + "num_tokens": 341359042.0, + "step": 8950 + }, + { + "epoch": 1.138659203663656, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.710134506225586, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8648103475570679, + "num_tokens": 341403529.0, + "step": 8951 + }, + { + "epoch": 1.1387864139422466, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.434856414794922, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8726892471313477, + "num_tokens": 341442509.0, + "step": 8952 + }, + { + "epoch": 1.138913624220837, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.64922523498535, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8622584342956543, + "num_tokens": 341483015.0, + "step": 8953 + }, + { + "epoch": 1.1390408344994276, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.519424438476562, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8648594617843628, + "num_tokens": 341519676.0, + "step": 8954 + }, + { + "epoch": 1.1391680447780181, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.380447387695312, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8651173114776611, + "num_tokens": 341563483.0, + "step": 8955 + }, + { + "epoch": 1.1392952550566087, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.633508682250977, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8572603464126587, + "num_tokens": 341598387.0, + "step": 8956 + }, + { + "epoch": 1.139422465335199, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39838981628418, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8678789138793945, + "num_tokens": 341640253.0, + "step": 8957 + }, + { + "epoch": 1.1395496756137895, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50984764099121, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8922011852264404, + "num_tokens": 341675788.0, + "step": 8958 + }, + { + "epoch": 1.13967688589238, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.427778244018555, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8651755452156067, + "num_tokens": 341716917.0, + "step": 8959 + }, + { + "epoch": 1.1398040961709706, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.503536224365234, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8644533753395081, + "num_tokens": 341758936.0, + "step": 8960 + }, + { + "epoch": 1.139931306449561, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.468294143676758, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8410199284553528, + "num_tokens": 341797612.0, + "step": 8961 + }, + { + "epoch": 1.1400585167281516, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.384136199951172, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8611028790473938, + "num_tokens": 341836061.0, + "step": 8962 + }, + { + "epoch": 1.1401857270067421, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.427446365356445, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8642815351486206, + "num_tokens": 341875465.0, + "step": 8963 + }, + { + "epoch": 1.1403129372853327, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.493892669677734, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.859386146068573, + "num_tokens": 341919795.0, + "step": 8964 + }, + { + "epoch": 1.1404401475639232, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.54833984375, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8620358109474182, + "num_tokens": 341953291.0, + "step": 8965 + }, + { + "epoch": 1.1405673578425137, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3225040435791, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8836517930030823, + "num_tokens": 341988364.0, + "step": 8966 + }, + { + "epoch": 1.1406945681211043, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.505355834960938, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8689352869987488, + "num_tokens": 342028478.0, + "step": 8967 + }, + { + "epoch": 1.1408217783996948, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.289169311523438, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8726361989974976, + "num_tokens": 342067348.0, + "step": 8968 + }, + { + "epoch": 1.1409489886782853, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.414257049560547, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8611942529678345, + "num_tokens": 342105845.0, + "step": 8969 + }, + { + "epoch": 1.1410761989568756, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53310775756836, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8659636974334717, + "num_tokens": 342142072.0, + "step": 8970 + }, + { + "epoch": 1.1412034092354661, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.527624130249023, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8680580854415894, + "num_tokens": 342176592.0, + "step": 8971 + }, + { + "epoch": 1.1413306195140567, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.615116119384766, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.867745041847229, + "num_tokens": 342211896.0, + "step": 8972 + }, + { + "epoch": 1.1414578297926472, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.417278289794922, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8690807223320007, + "num_tokens": 342256078.0, + "step": 8973 + }, + { + "epoch": 1.1415850400712377, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.581911087036133, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.866981029510498, + "num_tokens": 342296766.0, + "step": 8974 + }, + { + "epoch": 1.1417122503498283, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.37921142578125, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8644549250602722, + "num_tokens": 342333689.0, + "step": 8975 + }, + { + "epoch": 1.1418394606284188, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.499841690063477, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8752546310424805, + "num_tokens": 342369605.0, + "step": 8976 + }, + { + "epoch": 1.1419666709070093, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34381103515625, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8742129802703857, + "num_tokens": 342410806.0, + "step": 8977 + }, + { + "epoch": 1.1420938811855998, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39299201965332, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8910512924194336, + "num_tokens": 342446957.0, + "step": 8978 + }, + { + "epoch": 1.1422210914641904, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.55805015563965, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8536840677261353, + "num_tokens": 342480717.0, + "step": 8979 + }, + { + "epoch": 1.142348301742781, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.369356155395508, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8729457855224609, + "num_tokens": 342520431.0, + "step": 8980 + }, + { + "epoch": 1.1424755120213712, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.431764602661133, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8621876835823059, + "num_tokens": 342561735.0, + "step": 8981 + }, + { + "epoch": 1.1426027222999617, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.51268768310547, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8771512508392334, + "num_tokens": 342593899.0, + "step": 8982 + }, + { + "epoch": 1.1427299325785523, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.622039794921875, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8641550540924072, + "num_tokens": 342628044.0, + "step": 8983 + }, + { + "epoch": 1.1428571428571428, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.407329559326172, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8628841042518616, + "num_tokens": 342665018.0, + "step": 8984 + }, + { + "epoch": 1.1429843531357333, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.443872451782227, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.854651927947998, + "num_tokens": 342704832.0, + "step": 8985 + }, + { + "epoch": 1.1431115634143239, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.40308380126953, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8724555373191833, + "num_tokens": 342741598.0, + "step": 8986 + }, + { + "epoch": 1.1432387736929144, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.524394989013672, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8613771200180054, + "num_tokens": 342777302.0, + "step": 8987 + }, + { + "epoch": 1.143365983971505, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.253557205200195, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8448395133018494, + "num_tokens": 342809578.0, + "step": 8988 + }, + { + "epoch": 1.1434931942500954, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.577228546142578, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8668575882911682, + "num_tokens": 342845809.0, + "step": 8989 + }, + { + "epoch": 1.143620404528686, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.495208740234375, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8510972261428833, + "num_tokens": 342886118.0, + "step": 8990 + }, + { + "epoch": 1.1437476148072765, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.261003494262695, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.859915554523468, + "num_tokens": 342922517.0, + "step": 8991 + }, + { + "epoch": 1.143874825085867, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.597068786621094, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8595820069313049, + "num_tokens": 342959741.0, + "step": 8992 + }, + { + "epoch": 1.1440020353644575, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.546863555908203, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8782564401626587, + "num_tokens": 342991418.0, + "step": 8993 + }, + { + "epoch": 1.144129245643048, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.465791702270508, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.882361650466919, + "num_tokens": 343024263.0, + "step": 8994 + }, + { + "epoch": 1.1442564559216384, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.432451248168945, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.866178035736084, + "num_tokens": 343066388.0, + "step": 8995 + }, + { + "epoch": 1.144383666200229, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.24493408203125, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8667445778846741, + "num_tokens": 343107631.0, + "step": 8996 + }, + { + "epoch": 1.1445108764788194, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.527931213378906, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8714152574539185, + "num_tokens": 343143645.0, + "step": 8997 + }, + { + "epoch": 1.14463808675741, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.534996032714844, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8516983389854431, + "num_tokens": 343182076.0, + "step": 8998 + }, + { + "epoch": 1.1447652970360005, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.559232711791992, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8839595913887024, + "num_tokens": 343222808.0, + "step": 8999 + }, + { + "epoch": 1.144892507314591, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.390682220458984, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.861386775970459, + "num_tokens": 343268229.0, + "step": 9000 + }, + { + "epoch": 1.1450197175931816, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.530866622924805, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8574297428131104, + "num_tokens": 343304459.0, + "step": 9001 + }, + { + "epoch": 1.145146927871772, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.501476287841797, + "learning_rate": 1e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8279600739479065, + "num_tokens": 343343211.0, + "step": 9002 + }, + { + "epoch": 1.1452741381503626, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.418590545654297, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8610528707504272, + "num_tokens": 343388498.0, + "step": 9003 + }, + { + "epoch": 1.1454013484289531, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.4693603515625, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.862891435623169, + "num_tokens": 343430101.0, + "step": 9004 + }, + { + "epoch": 1.1455285587075437, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.64517593383789, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8534371852874756, + "num_tokens": 343466454.0, + "step": 9005 + }, + { + "epoch": 1.145655768986134, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.551071166992188, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8801363706588745, + "num_tokens": 343506960.0, + "step": 9006 + }, + { + "epoch": 1.1457829792647245, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.47928810119629, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8730956315994263, + "num_tokens": 343541133.0, + "step": 9007 + }, + { + "epoch": 1.145910189543315, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.54638671875, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8734106421470642, + "num_tokens": 343580266.0, + "step": 9008 + }, + { + "epoch": 1.1460373998219056, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.601646423339844, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.863111138343811, + "num_tokens": 343611691.0, + "step": 9009 + }, + { + "epoch": 1.146164610100496, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.58454704284668, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8714054822921753, + "num_tokens": 343649355.0, + "step": 9010 + }, + { + "epoch": 1.1462918203790866, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.518096923828125, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8798854947090149, + "num_tokens": 343681606.0, + "step": 9011 + }, + { + "epoch": 1.1464190306576771, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.487030029296875, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8759710788726807, + "num_tokens": 343721592.0, + "step": 9012 + }, + { + "epoch": 1.1465462409362677, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.559062957763672, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8624836206436157, + "num_tokens": 343764084.0, + "step": 9013 + }, + { + "epoch": 1.1466734512148582, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.696489334106445, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8671562075614929, + "num_tokens": 343801213.0, + "step": 9014 + }, + { + "epoch": 1.1468006614934487, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.55297088623047, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8671448230743408, + "num_tokens": 343839761.0, + "step": 9015 + }, + { + "epoch": 1.1469278717720393, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.518869400024414, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8647518157958984, + "num_tokens": 343877792.0, + "step": 9016 + }, + { + "epoch": 1.1470550820506298, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.574081420898438, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8716290593147278, + "num_tokens": 343915107.0, + "step": 9017 + }, + { + "epoch": 1.1471822923292203, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56729507446289, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.870588481426239, + "num_tokens": 343949783.0, + "step": 9018 + }, + { + "epoch": 1.1473095026078106, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.44211196899414, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.846801221370697, + "num_tokens": 343991058.0, + "step": 9019 + }, + { + "epoch": 1.1474367128864011, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.457351684570312, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8653996586799622, + "num_tokens": 344030424.0, + "step": 9020 + }, + { + "epoch": 1.1475639231649917, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50214195251465, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.867226779460907, + "num_tokens": 344071385.0, + "step": 9021 + }, + { + "epoch": 1.1476911334435822, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.5274658203125, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8751274943351746, + "num_tokens": 344109725.0, + "step": 9022 + }, + { + "epoch": 1.1478183437221727, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.507246017456055, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.873192310333252, + "num_tokens": 344143947.0, + "step": 9023 + }, + { + "epoch": 1.1479455540007633, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56749725341797, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8557796478271484, + "num_tokens": 344179969.0, + "step": 9024 + }, + { + "epoch": 1.1480727642793538, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.58653450012207, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8762571215629578, + "num_tokens": 344212279.0, + "step": 9025 + }, + { + "epoch": 1.1481999745579443, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.435272216796875, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8712470531463623, + "num_tokens": 344248402.0, + "step": 9026 + }, + { + "epoch": 1.1483271848365348, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.441970825195312, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.862682580947876, + "num_tokens": 344286557.0, + "step": 9027 + }, + { + "epoch": 1.1484543951151254, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.38108253479004, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8663415908813477, + "num_tokens": 344331014.0, + "step": 9028 + }, + { + "epoch": 1.148581605393716, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.734378814697266, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8632761836051941, + "num_tokens": 344373239.0, + "step": 9029 + }, + { + "epoch": 1.1487088156723062, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.453523635864258, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8806130886077881, + "num_tokens": 344409871.0, + "step": 9030 + }, + { + "epoch": 1.1488360259508967, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.478490829467773, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8555142879486084, + "num_tokens": 344452108.0, + "step": 9031 + }, + { + "epoch": 1.1489632362294873, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.502384185791016, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8593152165412903, + "num_tokens": 344489798.0, + "step": 9032 + }, + { + "epoch": 1.1490904465080778, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53815460205078, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8737081289291382, + "num_tokens": 344528194.0, + "step": 9033 + }, + { + "epoch": 1.1492176567866683, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45281219482422, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8513075113296509, + "num_tokens": 344559699.0, + "step": 9034 + }, + { + "epoch": 1.1493448670652588, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.468538284301758, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8580940365791321, + "num_tokens": 344598234.0, + "step": 9035 + }, + { + "epoch": 1.1494720773438494, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.504383087158203, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8591468334197998, + "num_tokens": 344634780.0, + "step": 9036 + }, + { + "epoch": 1.14959928762244, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.481401443481445, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8694837093353271, + "num_tokens": 344677167.0, + "step": 9037 + }, + { + "epoch": 1.1497264979010304, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.631494522094727, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.852479875087738, + "num_tokens": 344713824.0, + "step": 9038 + }, + { + "epoch": 1.149853708179621, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.556020736694336, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.853341281414032, + "num_tokens": 344755814.0, + "step": 9039 + }, + { + "epoch": 1.1499809184582115, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.511829376220703, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8800493478775024, + "num_tokens": 344795022.0, + "step": 9040 + }, + { + "epoch": 1.150108128736802, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.640655517578125, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8748242855072021, + "num_tokens": 344833906.0, + "step": 9041 + }, + { + "epoch": 1.1502353390153925, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53919792175293, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8710923790931702, + "num_tokens": 344875975.0, + "step": 9042 + }, + { + "epoch": 1.150362549293983, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45958709716797, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8685612678527832, + "num_tokens": 344915433.0, + "step": 9043 + }, + { + "epoch": 1.1504897595725734, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.480010986328125, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8700367212295532, + "num_tokens": 344949038.0, + "step": 9044 + }, + { + "epoch": 1.150616969851164, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.54888916015625, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8670642971992493, + "num_tokens": 344988260.0, + "step": 9045 + }, + { + "epoch": 1.1507441801297544, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.478609085083008, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8780043125152588, + "num_tokens": 345024355.0, + "step": 9046 + }, + { + "epoch": 1.150871390408345, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.47871208190918, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8749235272407532, + "num_tokens": 345063041.0, + "step": 9047 + }, + { + "epoch": 1.1509986006869355, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.405588150024414, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8854560852050781, + "num_tokens": 345097813.0, + "step": 9048 + }, + { + "epoch": 1.151125810965526, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45322608947754, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8617575168609619, + "num_tokens": 345134534.0, + "step": 9049 + }, + { + "epoch": 1.1512530212441165, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.445446014404297, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8696675896644592, + "num_tokens": 345171018.0, + "step": 9050 + }, + { + "epoch": 1.151380231522707, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.425153732299805, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8768450617790222, + "num_tokens": 345208006.0, + "step": 9051 + }, + { + "epoch": 1.1515074418012976, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35080909729004, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8837262392044067, + "num_tokens": 345246616.0, + "step": 9052 + }, + { + "epoch": 1.1516346520798881, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.482666015625, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8816932439804077, + "num_tokens": 345288095.0, + "step": 9053 + }, + { + "epoch": 1.1517618623584787, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.603914260864258, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8516362309455872, + "num_tokens": 345333291.0, + "step": 9054 + }, + { + "epoch": 1.151889072637069, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.59536361694336, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.864829957485199, + "num_tokens": 345375093.0, + "step": 9055 + }, + { + "epoch": 1.1520162829156595, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.304058074951172, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8594818115234375, + "num_tokens": 345415547.0, + "step": 9056 + }, + { + "epoch": 1.15214349319425, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.571130752563477, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8700481653213501, + "num_tokens": 345457340.0, + "step": 9057 + }, + { + "epoch": 1.1522707034728406, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.334285736083984, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8622586727142334, + "num_tokens": 345494711.0, + "step": 9058 + }, + { + "epoch": 1.152397913751431, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.445682525634766, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8580315113067627, + "num_tokens": 345534249.0, + "step": 9059 + }, + { + "epoch": 1.1525251240300216, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.461780548095703, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8647204637527466, + "num_tokens": 345570832.0, + "step": 9060 + }, + { + "epoch": 1.1526523343086121, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50345802307129, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8605514764785767, + "num_tokens": 345609007.0, + "step": 9061 + }, + { + "epoch": 1.1527795445872027, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.560789108276367, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8772685527801514, + "num_tokens": 345647979.0, + "step": 9062 + }, + { + "epoch": 1.1529067548657932, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39558982849121, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8591785430908203, + "num_tokens": 345681821.0, + "step": 9063 + }, + { + "epoch": 1.1530339651443837, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.595287322998047, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8551723957061768, + "num_tokens": 345724274.0, + "step": 9064 + }, + { + "epoch": 1.1531611754229742, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.603553771972656, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8812646865844727, + "num_tokens": 345763558.0, + "step": 9065 + }, + { + "epoch": 1.1532883857015648, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.699953079223633, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8613704442977905, + "num_tokens": 345798486.0, + "step": 9066 + }, + { + "epoch": 1.1534155959801553, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.35393524169922, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.862179160118103, + "num_tokens": 345839379.0, + "step": 9067 + }, + { + "epoch": 1.1535428062587456, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53965950012207, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8619481921195984, + "num_tokens": 345876675.0, + "step": 9068 + }, + { + "epoch": 1.1536700165373361, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.551206588745117, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8842930793762207, + "num_tokens": 345913482.0, + "step": 9069 + }, + { + "epoch": 1.1537972268159267, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.390047073364258, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8611369132995605, + "num_tokens": 345947671.0, + "step": 9070 + }, + { + "epoch": 1.1539244370945172, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.872272491455078, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8688867688179016, + "num_tokens": 345981520.0, + "step": 9071 + }, + { + "epoch": 1.1540516473731077, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.365386962890625, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8494241237640381, + "num_tokens": 346020050.0, + "step": 9072 + }, + { + "epoch": 1.1541788576516983, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.710479736328125, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.857862651348114, + "num_tokens": 346050000.0, + "step": 9073 + }, + { + "epoch": 1.1543060679302888, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.551294326782227, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8601032495498657, + "num_tokens": 346085038.0, + "step": 9074 + }, + { + "epoch": 1.1544332782088793, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.552326202392578, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8759965896606445, + "num_tokens": 346125871.0, + "step": 9075 + }, + { + "epoch": 1.1545604884874698, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.554574966430664, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8738119602203369, + "num_tokens": 346160460.0, + "step": 9076 + }, + { + "epoch": 1.1546876987660604, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.52457046508789, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8555691242218018, + "num_tokens": 346201621.0, + "step": 9077 + }, + { + "epoch": 1.154814909044651, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.504165649414062, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8682823777198792, + "num_tokens": 346242680.0, + "step": 9078 + }, + { + "epoch": 1.1549421193232412, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.384981155395508, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8564071655273438, + "num_tokens": 346275774.0, + "step": 9079 + }, + { + "epoch": 1.1550693296018317, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.64684295654297, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8681740164756775, + "num_tokens": 346317648.0, + "step": 9080 + }, + { + "epoch": 1.1551965398804223, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.428722381591797, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8698930740356445, + "num_tokens": 346357736.0, + "step": 9081 + }, + { + "epoch": 1.1553237501590128, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.44236946105957, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8652126789093018, + "num_tokens": 346397961.0, + "step": 9082 + }, + { + "epoch": 1.1554509604376033, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.41061019897461, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8766626715660095, + "num_tokens": 346438268.0, + "step": 9083 + }, + { + "epoch": 1.1555781707161938, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.603416442871094, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8569904565811157, + "num_tokens": 346479024.0, + "step": 9084 + }, + { + "epoch": 1.1557053809947844, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.445775985717773, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8721234798431396, + "num_tokens": 346523526.0, + "step": 9085 + }, + { + "epoch": 1.155832591273375, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.626073837280273, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8745424747467041, + "num_tokens": 346561985.0, + "step": 9086 + }, + { + "epoch": 1.1559598015519654, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.351654052734375, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8527517318725586, + "num_tokens": 346605600.0, + "step": 9087 + }, + { + "epoch": 1.156087011830556, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53719711303711, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8782755136489868, + "num_tokens": 346640798.0, + "step": 9088 + }, + { + "epoch": 1.1562142221091465, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.54700469970703, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8602408766746521, + "num_tokens": 346678483.0, + "step": 9089 + }, + { + "epoch": 1.156341432387737, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.556934356689453, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8816755414009094, + "num_tokens": 346714604.0, + "step": 9090 + }, + { + "epoch": 1.1564686426663275, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.654033660888672, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.876438319683075, + "num_tokens": 346747203.0, + "step": 9091 + }, + { + "epoch": 1.156595852944918, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.361661911010742, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8646935224533081, + "num_tokens": 346790650.0, + "step": 9092 + }, + { + "epoch": 1.1567230632235084, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.63997459411621, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8886698484420776, + "num_tokens": 346822872.0, + "step": 9093 + }, + { + "epoch": 1.156850273502099, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.562044143676758, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8723230361938477, + "num_tokens": 346857999.0, + "step": 9094 + }, + { + "epoch": 1.1569774837806894, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.496492385864258, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8725932836532593, + "num_tokens": 346891804.0, + "step": 9095 + }, + { + "epoch": 1.15710469405928, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.66493034362793, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8656832575798035, + "num_tokens": 346931554.0, + "step": 9096 + }, + { + "epoch": 1.1572319043378705, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.474647521972656, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8528860807418823, + "num_tokens": 346965214.0, + "step": 9097 + }, + { + "epoch": 1.157359114616461, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.405712127685547, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8773012161254883, + "num_tokens": 347001219.0, + "step": 9098 + }, + { + "epoch": 1.1574863248950515, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.392961502075195, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.84926837682724, + "num_tokens": 347043212.0, + "step": 9099 + }, + { + "epoch": 1.157613535173642, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.612001419067383, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.878673791885376, + "num_tokens": 347082249.0, + "step": 9100 + }, + { + "epoch": 1.1577407454522326, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.352632522583008, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8641706705093384, + "num_tokens": 347119286.0, + "step": 9101 + }, + { + "epoch": 1.1578679557308231, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.462474822998047, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8784792423248291, + "num_tokens": 347155549.0, + "step": 9102 + }, + { + "epoch": 1.1579951660094137, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.519697189331055, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8691061735153198, + "num_tokens": 347196846.0, + "step": 9103 + }, + { + "epoch": 1.158122376288004, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.58064842224121, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8798628449440002, + "num_tokens": 347234028.0, + "step": 9104 + }, + { + "epoch": 1.1582495865665945, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.473947525024414, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8774949312210083, + "num_tokens": 347276931.0, + "step": 9105 + }, + { + "epoch": 1.158376796845185, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.613494873046875, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8633590936660767, + "num_tokens": 347317139.0, + "step": 9106 + }, + { + "epoch": 1.1585040071237755, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.64240074157715, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8687471747398376, + "num_tokens": 347355718.0, + "step": 9107 + }, + { + "epoch": 1.158631217402366, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6033878326416, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8571531772613525, + "num_tokens": 347393403.0, + "step": 9108 + }, + { + "epoch": 1.1587584276809566, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.552637100219727, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.874436616897583, + "num_tokens": 347433736.0, + "step": 9109 + }, + { + "epoch": 1.1588856379595471, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.52382469177246, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8682656288146973, + "num_tokens": 347474574.0, + "step": 9110 + }, + { + "epoch": 1.1590128482381377, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.542495727539062, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8723328113555908, + "num_tokens": 347518785.0, + "step": 9111 + }, + { + "epoch": 1.1591400585167282, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.676368713378906, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8542565107345581, + "num_tokens": 347557518.0, + "step": 9112 + }, + { + "epoch": 1.1592672687953187, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.596633911132812, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8371846675872803, + "num_tokens": 347600094.0, + "step": 9113 + }, + { + "epoch": 1.1593944790739092, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.550487518310547, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8649610280990601, + "num_tokens": 347633709.0, + "step": 9114 + }, + { + "epoch": 1.1595216893524998, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.606042861938477, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8730172514915466, + "num_tokens": 347670027.0, + "step": 9115 + }, + { + "epoch": 1.1596488996310903, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.586467742919922, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.871041476726532, + "num_tokens": 347707488.0, + "step": 9116 + }, + { + "epoch": 1.1597761099096806, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8007755279541, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8604308366775513, + "num_tokens": 347746030.0, + "step": 9117 + }, + { + "epoch": 1.1599033201882711, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.611026763916016, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8587560057640076, + "num_tokens": 347783409.0, + "step": 9118 + }, + { + "epoch": 1.1600305304668617, + "ewc_loss": 0.0303955078125, + "ewc_loss_parallel": 3.039836883544922e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.617170333862305, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8741111755371094, + "num_tokens": 347821335.0, + "step": 9119 + }, + { + "epoch": 1.1601577407454522, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.63443374633789, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.87703537940979, + "num_tokens": 347857590.0, + "step": 9120 + }, + { + "epoch": 1.1602849510240427, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50598907470703, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8784432411193848, + "num_tokens": 347897000.0, + "step": 9121 + }, + { + "epoch": 1.1604121613026332, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.529598236083984, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8656982779502869, + "num_tokens": 347928947.0, + "step": 9122 + }, + { + "epoch": 1.1605393715812238, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.68170928955078, + "learning_rate": 1e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.8367155194282532, + "num_tokens": 347959294.0, + "step": 9123 + }, + { + "epoch": 1.1606665818598143, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.604019165039062, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.872453510761261, + "num_tokens": 347993103.0, + "step": 9124 + }, + { + "epoch": 1.1607937921384048, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.3960018157959, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8797107934951782, + "num_tokens": 348028412.0, + "step": 9125 + }, + { + "epoch": 1.1609210024169954, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.52410888671875, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8496462106704712, + "num_tokens": 348071429.0, + "step": 9126 + }, + { + "epoch": 1.161048212695586, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.526334762573242, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8718328475952148, + "num_tokens": 348113222.0, + "step": 9127 + }, + { + "epoch": 1.1611754229741762, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.765342712402344, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8650608658790588, + "num_tokens": 348150188.0, + "step": 9128 + }, + { + "epoch": 1.1613026332527667, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.390701293945312, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8701491355895996, + "num_tokens": 348188547.0, + "step": 9129 + }, + { + "epoch": 1.1614298435313573, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.649660110473633, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8617403507232666, + "num_tokens": 348228624.0, + "step": 9130 + }, + { + "epoch": 1.1615570538099478, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.495235443115234, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8633738160133362, + "num_tokens": 348264932.0, + "step": 9131 + }, + { + "epoch": 1.1616842640885383, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.661849975585938, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8679999709129333, + "num_tokens": 348307680.0, + "step": 9132 + }, + { + "epoch": 1.1618114743671288, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.564250946044922, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.88070148229599, + "num_tokens": 348346449.0, + "step": 9133 + }, + { + "epoch": 1.1619386846457194, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.611900329589844, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8774107694625854, + "num_tokens": 348386393.0, + "step": 9134 + }, + { + "epoch": 1.16206589492431, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.49139404296875, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8602896332740784, + "num_tokens": 348429458.0, + "step": 9135 + }, + { + "epoch": 1.1621931052029004, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.73921775817871, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8545988202095032, + "num_tokens": 348471365.0, + "step": 9136 + }, + { + "epoch": 1.162320315481491, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.716365814208984, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8511019945144653, + "num_tokens": 348513425.0, + "step": 9137 + }, + { + "epoch": 1.1624475257600815, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.550525665283203, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.864109992980957, + "num_tokens": 348551181.0, + "step": 9138 + }, + { + "epoch": 1.162574736038672, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.69747543334961, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8875975012779236, + "num_tokens": 348583611.0, + "step": 9139 + }, + { + "epoch": 1.1627019463172625, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.549964904785156, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8647891283035278, + "num_tokens": 348624281.0, + "step": 9140 + }, + { + "epoch": 1.162829156595853, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.65495491027832, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8654663562774658, + "num_tokens": 348661416.0, + "step": 9141 + }, + { + "epoch": 1.1629563668744434, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.62800407409668, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8466506004333496, + "num_tokens": 348703907.0, + "step": 9142 + }, + { + "epoch": 1.163083577153034, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.512460708618164, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8785971403121948, + "num_tokens": 348735617.0, + "step": 9143 + }, + { + "epoch": 1.1632107874316244, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.65945053100586, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8642324209213257, + "num_tokens": 348773552.0, + "step": 9144 + }, + { + "epoch": 1.163337997710215, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.694761276245117, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8673185110092163, + "num_tokens": 348811274.0, + "step": 9145 + }, + { + "epoch": 1.1634652079888055, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.62962532043457, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8543316125869751, + "num_tokens": 348847773.0, + "step": 9146 + }, + { + "epoch": 1.163592418267396, + "ewc_loss": 0.0302734375, + "ewc_loss_parallel": 3.0279159545898438e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.539260864257812, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8582075238227844, + "num_tokens": 348886356.0, + "step": 9147 + }, + { + "epoch": 1.1637196285459865, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.67885398864746, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.877813458442688, + "num_tokens": 348922892.0, + "step": 9148 + }, + { + "epoch": 1.163846838824577, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.47030258178711, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8856326341629028, + "num_tokens": 348954924.0, + "step": 9149 + }, + { + "epoch": 1.1639740491031676, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.73406219482422, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8730124831199646, + "num_tokens": 348997335.0, + "step": 9150 + }, + { + "epoch": 1.1641012593817581, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.489721298217773, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8561788201332092, + "num_tokens": 349038221.0, + "step": 9151 + }, + { + "epoch": 1.1642284696603487, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.890243530273438, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8625020980834961, + "num_tokens": 349076559.0, + "step": 9152 + }, + { + "epoch": 1.164355679938939, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6627197265625, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8451056480407715, + "num_tokens": 349112264.0, + "step": 9153 + }, + { + "epoch": 1.1644828902175295, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.843246459960938, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8797365427017212, + "num_tokens": 349146571.0, + "step": 9154 + }, + { + "epoch": 1.16461010049612, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.73341941833496, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8732495307922363, + "num_tokens": 349183674.0, + "step": 9155 + }, + { + "epoch": 1.1647373107747105, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.464698791503906, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8706496953964233, + "num_tokens": 349227042.0, + "step": 9156 + }, + { + "epoch": 1.164864521053301, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.822711944580078, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8617970943450928, + "num_tokens": 349264072.0, + "step": 9157 + }, + { + "epoch": 1.1649917313318916, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.547988891601562, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8684061765670776, + "num_tokens": 349304427.0, + "step": 9158 + }, + { + "epoch": 1.1651189416104821, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.54970932006836, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8657625317573547, + "num_tokens": 349346708.0, + "step": 9159 + }, + { + "epoch": 1.1652461518890727, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.65540313720703, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8655509948730469, + "num_tokens": 349387716.0, + "step": 9160 + }, + { + "epoch": 1.1653733621676632, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.786331176757812, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8634687066078186, + "num_tokens": 349425757.0, + "step": 9161 + }, + { + "epoch": 1.1655005724462537, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.279647827148438, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8732316493988037, + "num_tokens": 349472097.0, + "step": 9162 + }, + { + "epoch": 1.1656277827248442, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.591083526611328, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8640220165252686, + "num_tokens": 349513809.0, + "step": 9163 + }, + { + "epoch": 1.1657549930034348, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.516748428344727, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8820230960845947, + "num_tokens": 349544906.0, + "step": 9164 + }, + { + "epoch": 1.1658822032820253, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.617069244384766, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.861910343170166, + "num_tokens": 349582529.0, + "step": 9165 + }, + { + "epoch": 1.1660094135606156, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56098747253418, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8613591194152832, + "num_tokens": 349624871.0, + "step": 9166 + }, + { + "epoch": 1.1661366238392061, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.770633697509766, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8314423561096191, + "num_tokens": 349660783.0, + "step": 9167 + }, + { + "epoch": 1.1662638341177967, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.525590896606445, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8762590289115906, + "num_tokens": 349698430.0, + "step": 9168 + }, + { + "epoch": 1.1663910443963872, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.608760833740234, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8588764071464539, + "num_tokens": 349734981.0, + "step": 9169 + }, + { + "epoch": 1.1665182546749777, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.69508171081543, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8609856367111206, + "num_tokens": 349774267.0, + "step": 9170 + }, + { + "epoch": 1.1666454649535682, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.491682052612305, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.862360417842865, + "num_tokens": 349811400.0, + "step": 9171 + }, + { + "epoch": 1.1667726752321588, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.772634506225586, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8583327531814575, + "num_tokens": 349850031.0, + "step": 9172 + }, + { + "epoch": 1.1668998855107493, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.607206344604492, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.869147777557373, + "num_tokens": 349889967.0, + "step": 9173 + }, + { + "epoch": 1.1670270957893398, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.50727081298828, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8611633777618408, + "num_tokens": 349930276.0, + "step": 9174 + }, + { + "epoch": 1.1671543060679304, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.559465408325195, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8613373637199402, + "num_tokens": 349969889.0, + "step": 9175 + }, + { + "epoch": 1.1672815163465209, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.608036041259766, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8638609051704407, + "num_tokens": 350005729.0, + "step": 9176 + }, + { + "epoch": 1.1674087266251112, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.453624725341797, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.872825026512146, + "num_tokens": 350043130.0, + "step": 9177 + }, + { + "epoch": 1.1675359369037017, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.44835662841797, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8851161599159241, + "num_tokens": 350081233.0, + "step": 9178 + }, + { + "epoch": 1.1676631471822922, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.44791603088379, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8537300825119019, + "num_tokens": 350119403.0, + "step": 9179 + }, + { + "epoch": 1.1677903574608828, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.590431213378906, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8540483713150024, + "num_tokens": 350159099.0, + "step": 9180 + }, + { + "epoch": 1.1679175677394733, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.733062744140625, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8664133548736572, + "num_tokens": 350195642.0, + "step": 9181 + }, + { + "epoch": 1.1680447780180638, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.54859161376953, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.873971164226532, + "num_tokens": 350236721.0, + "step": 9182 + }, + { + "epoch": 1.1681719882966544, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.737014770507812, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8615777492523193, + "num_tokens": 350272691.0, + "step": 9183 + }, + { + "epoch": 1.168299198575245, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.49882698059082, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.865902841091156, + "num_tokens": 350318299.0, + "step": 9184 + }, + { + "epoch": 1.1684264088538354, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56110191345215, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8653566241264343, + "num_tokens": 350360973.0, + "step": 9185 + }, + { + "epoch": 1.168553619132426, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.654245376586914, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8741707801818848, + "num_tokens": 350400516.0, + "step": 9186 + }, + { + "epoch": 1.1686808294110165, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.600370407104492, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8619587421417236, + "num_tokens": 350438147.0, + "step": 9187 + }, + { + "epoch": 1.168808039689607, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74738883972168, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8712834119796753, + "num_tokens": 350481604.0, + "step": 9188 + }, + { + "epoch": 1.1689352499681975, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.702253341674805, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8865578174591064, + "num_tokens": 350518225.0, + "step": 9189 + }, + { + "epoch": 1.169062460246788, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.510883331298828, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8651276230812073, + "num_tokens": 350553752.0, + "step": 9190 + }, + { + "epoch": 1.1691896705253784, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.609615325927734, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8708950281143188, + "num_tokens": 350593676.0, + "step": 9191 + }, + { + "epoch": 1.169316880803969, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80653953552246, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8642638921737671, + "num_tokens": 350633618.0, + "step": 9192 + }, + { + "epoch": 1.1694440910825594, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.44843292236328, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8637056946754456, + "num_tokens": 350678221.0, + "step": 9193 + }, + { + "epoch": 1.16957130136115, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.835693359375, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8761842846870422, + "num_tokens": 350718258.0, + "step": 9194 + }, + { + "epoch": 1.1696985116397405, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.7249813079834, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8765684962272644, + "num_tokens": 350757370.0, + "step": 9195 + }, + { + "epoch": 1.169825721918331, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.539430618286133, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8671103715896606, + "num_tokens": 350801394.0, + "step": 9196 + }, + { + "epoch": 1.1699529321969215, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.606515884399414, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8587785959243774, + "num_tokens": 350840635.0, + "step": 9197 + }, + { + "epoch": 1.170080142475512, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.642404556274414, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8818346858024597, + "num_tokens": 350879616.0, + "step": 9198 + }, + { + "epoch": 1.1702073527541026, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.548036575317383, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.865759551525116, + "num_tokens": 350921710.0, + "step": 9199 + }, + { + "epoch": 1.1703345630326931, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.582691192626953, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.85433030128479, + "num_tokens": 350964218.0, + "step": 9200 + }, + { + "epoch": 1.1704617733112836, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.39480209350586, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8763180375099182, + "num_tokens": 351004511.0, + "step": 9201 + }, + { + "epoch": 1.170588983589874, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.476940155029297, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8615738153457642, + "num_tokens": 351044583.0, + "step": 9202 + }, + { + "epoch": 1.1707161938684645, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.771183013916016, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8494242429733276, + "num_tokens": 351083613.0, + "step": 9203 + }, + { + "epoch": 1.170843404147055, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.527637481689453, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8587506413459778, + "num_tokens": 351117412.0, + "step": 9204 + }, + { + "epoch": 1.1709706144256455, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.872554779052734, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8521676063537598, + "num_tokens": 351154779.0, + "step": 9205 + }, + { + "epoch": 1.171097824704236, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.533279418945312, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8771679401397705, + "num_tokens": 351191412.0, + "step": 9206 + }, + { + "epoch": 1.1712250349828266, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.678789138793945, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8686861395835876, + "num_tokens": 351235465.0, + "step": 9207 + }, + { + "epoch": 1.1713522452614171, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.64552879333496, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8718014359474182, + "num_tokens": 351279326.0, + "step": 9208 + }, + { + "epoch": 1.1714794555400077, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.786937713623047, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8828492760658264, + "num_tokens": 351316678.0, + "step": 9209 + }, + { + "epoch": 1.1716066658185982, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.618322372436523, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8626038432121277, + "num_tokens": 351352641.0, + "step": 9210 + }, + { + "epoch": 1.1717338760971887, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.703699111938477, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8569480180740356, + "num_tokens": 351385845.0, + "step": 9211 + }, + { + "epoch": 1.1718610863757792, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.62031364440918, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8727898001670837, + "num_tokens": 351425220.0, + "step": 9212 + }, + { + "epoch": 1.1719882966543698, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.658039093017578, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8609251976013184, + "num_tokens": 351467515.0, + "step": 9213 + }, + { + "epoch": 1.1721155069329603, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.568981170654297, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8634828925132751, + "num_tokens": 351507571.0, + "step": 9214 + }, + { + "epoch": 1.1722427172115506, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.34149169921875, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8631405234336853, + "num_tokens": 351546150.0, + "step": 9215 + }, + { + "epoch": 1.1723699274901411, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.696624755859375, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8597428798675537, + "num_tokens": 351588574.0, + "step": 9216 + }, + { + "epoch": 1.1724971377687317, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.600574493408203, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8673229217529297, + "num_tokens": 351626914.0, + "step": 9217 + }, + { + "epoch": 1.1726243480473222, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.66499900817871, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8568727374076843, + "num_tokens": 351663574.0, + "step": 9218 + }, + { + "epoch": 1.1727515583259127, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.508522033691406, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8497041463851929, + "num_tokens": 351703275.0, + "step": 9219 + }, + { + "epoch": 1.1728787686045032, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.589109420776367, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8570764064788818, + "num_tokens": 351745713.0, + "step": 9220 + }, + { + "epoch": 1.1730059788830938, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.61369514465332, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8513516187667847, + "num_tokens": 351780789.0, + "step": 9221 + }, + { + "epoch": 1.1731331891616843, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.429075241088867, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8755161762237549, + "num_tokens": 351820352.0, + "step": 9222 + }, + { + "epoch": 1.1732603994402748, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.535131454467773, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8735207319259644, + "num_tokens": 351860011.0, + "step": 9223 + }, + { + "epoch": 1.1733876097188654, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.546693801879883, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8603658080101013, + "num_tokens": 351898742.0, + "step": 9224 + }, + { + "epoch": 1.1735148199974559, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56744384765625, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8536450862884521, + "num_tokens": 351940201.0, + "step": 9225 + }, + { + "epoch": 1.1736420302760462, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.53529167175293, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.863781213760376, + "num_tokens": 351971041.0, + "step": 9226 + }, + { + "epoch": 1.1737692405546367, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.457752227783203, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8486154079437256, + "num_tokens": 352013002.0, + "step": 9227 + }, + { + "epoch": 1.1738964508332272, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.489633560180664, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8642042875289917, + "num_tokens": 352051435.0, + "step": 9228 + }, + { + "epoch": 1.1740236611118178, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.696441650390625, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8718836307525635, + "num_tokens": 352093295.0, + "step": 9229 + }, + { + "epoch": 1.1741508713904083, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.599945068359375, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8620957136154175, + "num_tokens": 352136401.0, + "step": 9230 + }, + { + "epoch": 1.1742780816689988, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.55933952331543, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8769214153289795, + "num_tokens": 352178696.0, + "step": 9231 + }, + { + "epoch": 1.1744052919475894, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.424896240234375, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8472676277160645, + "num_tokens": 352214759.0, + "step": 9232 + }, + { + "epoch": 1.1745325022261799, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.637022018432617, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8786541223526001, + "num_tokens": 352250520.0, + "step": 9233 + }, + { + "epoch": 1.1746597125047704, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.58577537536621, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8640499114990234, + "num_tokens": 352288773.0, + "step": 9234 + }, + { + "epoch": 1.174786922783361, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.519847869873047, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8733523488044739, + "num_tokens": 352326548.0, + "step": 9235 + }, + { + "epoch": 1.1749141330619515, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.577980041503906, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.875796914100647, + "num_tokens": 352366537.0, + "step": 9236 + }, + { + "epoch": 1.175041343340542, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.368844985961914, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8488448858261108, + "num_tokens": 352402390.0, + "step": 9237 + }, + { + "epoch": 1.1751685536191325, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.516279220581055, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8630218505859375, + "num_tokens": 352442357.0, + "step": 9238 + }, + { + "epoch": 1.175295763897723, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.647701263427734, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8638237714767456, + "num_tokens": 352476884.0, + "step": 9239 + }, + { + "epoch": 1.1754229741763134, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.487340927124023, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8656364679336548, + "num_tokens": 352513448.0, + "step": 9240 + }, + { + "epoch": 1.175550184454904, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.605806350708008, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8842489719390869, + "num_tokens": 352548420.0, + "step": 9241 + }, + { + "epoch": 1.1756773947334944, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.665136337280273, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8680962324142456, + "num_tokens": 352592157.0, + "step": 9242 + }, + { + "epoch": 1.175804605012085, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.573625564575195, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8574221134185791, + "num_tokens": 352626427.0, + "step": 9243 + }, + { + "epoch": 1.1759318152906755, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.57920265197754, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.875462532043457, + "num_tokens": 352667075.0, + "step": 9244 + }, + { + "epoch": 1.176059025569266, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.417478561401367, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8641435503959656, + "num_tokens": 352707114.0, + "step": 9245 + }, + { + "epoch": 1.1761862358478565, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.732545852661133, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8711893558502197, + "num_tokens": 352738064.0, + "step": 9246 + }, + { + "epoch": 1.176313446126447, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.363426208496094, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8504189848899841, + "num_tokens": 352779964.0, + "step": 9247 + }, + { + "epoch": 1.1764406564050376, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.472753524780273, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.874153733253479, + "num_tokens": 352819574.0, + "step": 9248 + }, + { + "epoch": 1.1765678666836281, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.709470748901367, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8744928240776062, + "num_tokens": 352859151.0, + "step": 9249 + }, + { + "epoch": 1.1766950769622184, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6064453125, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8444969654083252, + "num_tokens": 352901119.0, + "step": 9250 + }, + { + "epoch": 1.176822287240809, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.494403839111328, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8653029203414917, + "num_tokens": 352945878.0, + "step": 9251 + }, + { + "epoch": 1.1769494975193995, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.546850204467773, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8603444695472717, + "num_tokens": 352979819.0, + "step": 9252 + }, + { + "epoch": 1.17707670779799, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.601179122924805, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8542475700378418, + "num_tokens": 353023552.0, + "step": 9253 + }, + { + "epoch": 1.1772039180765805, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.675331115722656, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8666048049926758, + "num_tokens": 353058178.0, + "step": 9254 + }, + { + "epoch": 1.177331128355171, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.451091766357422, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8748857975006104, + "num_tokens": 353091428.0, + "step": 9255 + }, + { + "epoch": 1.1774583386337616, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70846176147461, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8685246706008911, + "num_tokens": 353131943.0, + "step": 9256 + }, + { + "epoch": 1.1775855489123521, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.652908325195312, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8588393330574036, + "num_tokens": 353171213.0, + "step": 9257 + }, + { + "epoch": 1.1777127591909426, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.562713623046875, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8743665814399719, + "num_tokens": 353205057.0, + "step": 9258 + }, + { + "epoch": 1.1778399694695332, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.807741165161133, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.850960373878479, + "num_tokens": 353251164.0, + "step": 9259 + }, + { + "epoch": 1.1779671797481237, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.55238151550293, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8703219294548035, + "num_tokens": 353288202.0, + "step": 9260 + }, + { + "epoch": 1.1780943900267142, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.563093185424805, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8644523620605469, + "num_tokens": 353327396.0, + "step": 9261 + }, + { + "epoch": 1.1782216003053048, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.456979751586914, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8540805578231812, + "num_tokens": 353366512.0, + "step": 9262 + }, + { + "epoch": 1.1783488105838953, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.686199188232422, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8579484224319458, + "num_tokens": 353408546.0, + "step": 9263 + }, + { + "epoch": 1.1784760208624856, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.689428329467773, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8734035491943359, + "num_tokens": 353444950.0, + "step": 9264 + }, + { + "epoch": 1.1786032311410761, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.638452529907227, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8730630278587341, + "num_tokens": 353483306.0, + "step": 9265 + }, + { + "epoch": 1.1787304414196667, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.493846893310547, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.86465984582901, + "num_tokens": 353523038.0, + "step": 9266 + }, + { + "epoch": 1.1788576516982572, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.71043586730957, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8757035732269287, + "num_tokens": 353564266.0, + "step": 9267 + }, + { + "epoch": 1.1789848619768477, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.520570755004883, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8693976998329163, + "num_tokens": 353603558.0, + "step": 9268 + }, + { + "epoch": 1.1791120722554382, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.809734344482422, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8734167218208313, + "num_tokens": 353648097.0, + "step": 9269 + }, + { + "epoch": 1.1792392825340288, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.373985290527344, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.850332498550415, + "num_tokens": 353688970.0, + "step": 9270 + }, + { + "epoch": 1.1793664928126193, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.58283805847168, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8552126884460449, + "num_tokens": 353732505.0, + "step": 9271 + }, + { + "epoch": 1.1794937030912098, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.642335891723633, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8742026090621948, + "num_tokens": 353767460.0, + "step": 9272 + }, + { + "epoch": 1.1796209133698004, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.683609008789062, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8751094341278076, + "num_tokens": 353801951.0, + "step": 9273 + }, + { + "epoch": 1.1797481236483909, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.785675048828125, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8739991188049316, + "num_tokens": 353836815.0, + "step": 9274 + }, + { + "epoch": 1.1798753339269812, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.565528869628906, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8583200573921204, + "num_tokens": 353876296.0, + "step": 9275 + }, + { + "epoch": 1.1800025442055717, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8671817779541, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8486400842666626, + "num_tokens": 353911792.0, + "step": 9276 + }, + { + "epoch": 1.1801297544841622, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.533994674682617, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8549630641937256, + "num_tokens": 353949532.0, + "step": 9277 + }, + { + "epoch": 1.1802569647627528, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.687280654907227, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8529895544052124, + "num_tokens": 353988622.0, + "step": 9278 + }, + { + "epoch": 1.1803841750413433, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.664323806762695, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8691826462745667, + "num_tokens": 354028136.0, + "step": 9279 + }, + { + "epoch": 1.1805113853199338, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.568748474121094, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8679942488670349, + "num_tokens": 354065029.0, + "step": 9280 + }, + { + "epoch": 1.1806385955985244, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.468786239624023, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.876976728439331, + "num_tokens": 354099279.0, + "step": 9281 + }, + { + "epoch": 1.1807658058771149, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.486127853393555, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8616015911102295, + "num_tokens": 354138870.0, + "step": 9282 + }, + { + "epoch": 1.1808930161557054, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.620622634887695, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8526495695114136, + "num_tokens": 354176498.0, + "step": 9283 + }, + { + "epoch": 1.181020226434296, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.45668601989746, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8468671441078186, + "num_tokens": 354210063.0, + "step": 9284 + }, + { + "epoch": 1.1811474367128865, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.714643478393555, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8585030436515808, + "num_tokens": 354248095.0, + "step": 9285 + }, + { + "epoch": 1.181274646991477, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.645051956176758, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8620625138282776, + "num_tokens": 354286897.0, + "step": 9286 + }, + { + "epoch": 1.1814018572700675, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.563562393188477, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.873787522315979, + "num_tokens": 354330685.0, + "step": 9287 + }, + { + "epoch": 1.181529067548658, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.59316635131836, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8569657206535339, + "num_tokens": 354371120.0, + "step": 9288 + }, + { + "epoch": 1.1816562778272484, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.67112159729004, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8629682064056396, + "num_tokens": 354410825.0, + "step": 9289 + }, + { + "epoch": 1.1817834881058389, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.4290714263916, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8619064092636108, + "num_tokens": 354453623.0, + "step": 9290 + }, + { + "epoch": 1.1819106983844294, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.819419860839844, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8657639026641846, + "num_tokens": 354491854.0, + "step": 9291 + }, + { + "epoch": 1.18203790866302, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.510299682617188, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.850615918636322, + "num_tokens": 354530788.0, + "step": 9292 + }, + { + "epoch": 1.1821651189416105, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.576862335205078, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8845323920249939, + "num_tokens": 354568576.0, + "step": 9293 + }, + { + "epoch": 1.182292329220201, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.7334041595459, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8650737404823303, + "num_tokens": 354611307.0, + "step": 9294 + }, + { + "epoch": 1.1824195394987915, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.670995712280273, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8722466230392456, + "num_tokens": 354648810.0, + "step": 9295 + }, + { + "epoch": 1.182546749777382, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.65679931640625, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8672374486923218, + "num_tokens": 354688411.0, + "step": 9296 + }, + { + "epoch": 1.1826739600559726, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.507312774658203, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8632307648658752, + "num_tokens": 354734700.0, + "step": 9297 + }, + { + "epoch": 1.1828011703345631, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56794548034668, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8626386523246765, + "num_tokens": 354765517.0, + "step": 9298 + }, + { + "epoch": 1.1829283806131534, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.808950424194336, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8696111440658569, + "num_tokens": 354808563.0, + "step": 9299 + }, + { + "epoch": 1.183055590891744, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.794042587280273, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8472145795822144, + "num_tokens": 354846618.0, + "step": 9300 + }, + { + "epoch": 1.1831828011703345, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.621620178222656, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8684090375900269, + "num_tokens": 354881293.0, + "step": 9301 + }, + { + "epoch": 1.183310011448925, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83026123046875, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8625330924987793, + "num_tokens": 354925239.0, + "step": 9302 + }, + { + "epoch": 1.1834372217275155, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.625280380249023, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8528268337249756, + "num_tokens": 354967836.0, + "step": 9303 + }, + { + "epoch": 1.183564432006106, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.812671661376953, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8502772450447083, + "num_tokens": 355003386.0, + "step": 9304 + }, + { + "epoch": 1.1836916422846966, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.85577964782715, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8435314893722534, + "num_tokens": 355043391.0, + "step": 9305 + }, + { + "epoch": 1.1838188525632871, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.688282012939453, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8665502667427063, + "num_tokens": 355082820.0, + "step": 9306 + }, + { + "epoch": 1.1839460628418776, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8079891204834, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8864151239395142, + "num_tokens": 355117531.0, + "step": 9307 + }, + { + "epoch": 1.1840732731204682, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6815242767334, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.852890133857727, + "num_tokens": 355159485.0, + "step": 9308 + }, + { + "epoch": 1.1842004833990587, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.856599807739258, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8604800701141357, + "num_tokens": 355206051.0, + "step": 9309 + }, + { + "epoch": 1.1843276936776492, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.75969696044922, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8564391136169434, + "num_tokens": 355242933.0, + "step": 9310 + }, + { + "epoch": 1.1844549039562398, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.81833839416504, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8770236372947693, + "num_tokens": 355286311.0, + "step": 9311 + }, + { + "epoch": 1.1845821142348303, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.613027572631836, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8646928668022156, + "num_tokens": 355327844.0, + "step": 9312 + }, + { + "epoch": 1.1847093245134206, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.680076599121094, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8735620975494385, + "num_tokens": 355368977.0, + "step": 9313 + }, + { + "epoch": 1.1848365347920111, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.747446060180664, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8555965423583984, + "num_tokens": 355411479.0, + "step": 9314 + }, + { + "epoch": 1.1849637450706016, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.728681564331055, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8828930854797363, + "num_tokens": 355447370.0, + "step": 9315 + }, + { + "epoch": 1.1850909553491922, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.725616455078125, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.862500011920929, + "num_tokens": 355490335.0, + "step": 9316 + }, + { + "epoch": 1.1852181656277827, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.590810775756836, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.866040050983429, + "num_tokens": 355527843.0, + "step": 9317 + }, + { + "epoch": 1.1853453759063732, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.72396469116211, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8626576662063599, + "num_tokens": 355566094.0, + "step": 9318 + }, + { + "epoch": 1.1854725861849638, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.66239356994629, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8732286691665649, + "num_tokens": 355609505.0, + "step": 9319 + }, + { + "epoch": 1.1855997964635543, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.75704002380371, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.840416431427002, + "num_tokens": 355650930.0, + "step": 9320 + }, + { + "epoch": 1.1857270067421448, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56734848022461, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8745708465576172, + "num_tokens": 355684075.0, + "step": 9321 + }, + { + "epoch": 1.1858542170207353, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.722381591796875, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8709733486175537, + "num_tokens": 355723567.0, + "step": 9322 + }, + { + "epoch": 1.1859814272993259, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.732379913330078, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8510755300521851, + "num_tokens": 355764915.0, + "step": 9323 + }, + { + "epoch": 1.1861086375779162, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70259666442871, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8660628795623779, + "num_tokens": 355804950.0, + "step": 9324 + }, + { + "epoch": 1.1862358478565067, + "ewc_loss": 0.030517578125, + "ewc_loss_parallel": 3.0517578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.842998504638672, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.873214602470398, + "num_tokens": 355844155.0, + "step": 9325 + }, + { + "epoch": 1.1863630581350972, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.818920135498047, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8814595937728882, + "num_tokens": 355883971.0, + "step": 9326 + }, + { + "epoch": 1.1864902684136878, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.653141021728516, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8606036901473999, + "num_tokens": 355922271.0, + "step": 9327 + }, + { + "epoch": 1.1866174786922783, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.489990234375, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8641588687896729, + "num_tokens": 355961087.0, + "step": 9328 + }, + { + "epoch": 1.1867446889708688, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.728506088256836, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8685874938964844, + "num_tokens": 355999553.0, + "step": 9329 + }, + { + "epoch": 1.1868718992494594, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.612722396850586, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8705441951751709, + "num_tokens": 356038503.0, + "step": 9330 + }, + { + "epoch": 1.1869991095280499, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.652124404907227, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8752604722976685, + "num_tokens": 356078538.0, + "step": 9331 + }, + { + "epoch": 1.1871263198066404, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.490116119384766, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8777530193328857, + "num_tokens": 356115031.0, + "step": 9332 + }, + { + "epoch": 1.187253530085231, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.57536506652832, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8794410228729248, + "num_tokens": 356154751.0, + "step": 9333 + }, + { + "epoch": 1.1873807403638215, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.596237182617188, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8502237796783447, + "num_tokens": 356192105.0, + "step": 9334 + }, + { + "epoch": 1.187507950642412, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.40345573425293, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8556244373321533, + "num_tokens": 356224964.0, + "step": 9335 + }, + { + "epoch": 1.1876351609210025, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.705045700073242, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8697314858436584, + "num_tokens": 356261565.0, + "step": 9336 + }, + { + "epoch": 1.187762371199593, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.356233596801758, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8672974705696106, + "num_tokens": 356301886.0, + "step": 9337 + }, + { + "epoch": 1.1878895814781834, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.889352798461914, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8734591007232666, + "num_tokens": 356339481.0, + "step": 9338 + }, + { + "epoch": 1.1880167917567739, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.496129989624023, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.842867374420166, + "num_tokens": 356387038.0, + "step": 9339 + }, + { + "epoch": 1.1881440020353644, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.59755516052246, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.861625611782074, + "num_tokens": 356433256.0, + "step": 9340 + }, + { + "epoch": 1.188271212313955, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.675052642822266, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.854531466960907, + "num_tokens": 356471282.0, + "step": 9341 + }, + { + "epoch": 1.1883984225925455, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.598464965820312, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8717110753059387, + "num_tokens": 356508951.0, + "step": 9342 + }, + { + "epoch": 1.188525632871136, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.741125106811523, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8683776259422302, + "num_tokens": 356544487.0, + "step": 9343 + }, + { + "epoch": 1.1886528431497265, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.710376739501953, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8719898462295532, + "num_tokens": 356580873.0, + "step": 9344 + }, + { + "epoch": 1.188780053428317, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.830078125, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8627673387527466, + "num_tokens": 356613622.0, + "step": 9345 + }, + { + "epoch": 1.1889072637069076, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.819679260253906, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8421412706375122, + "num_tokens": 356660058.0, + "step": 9346 + }, + { + "epoch": 1.189034473985498, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.665008544921875, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8643035292625427, + "num_tokens": 356701614.0, + "step": 9347 + }, + { + "epoch": 1.1891616842640884, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.69658851623535, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8804064989089966, + "num_tokens": 356737105.0, + "step": 9348 + }, + { + "epoch": 1.189288894542679, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.501195907592773, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8565495014190674, + "num_tokens": 356779382.0, + "step": 9349 + }, + { + "epoch": 1.1894161048212695, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.722007751464844, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8592523336410522, + "num_tokens": 356813836.0, + "step": 9350 + }, + { + "epoch": 1.18954331509986, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.747190475463867, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.856299877166748, + "num_tokens": 356852591.0, + "step": 9351 + }, + { + "epoch": 1.1896705253784505, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74382781982422, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8627066612243652, + "num_tokens": 356889940.0, + "step": 9352 + }, + { + "epoch": 1.189797735657041, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.57402992248535, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8689907789230347, + "num_tokens": 356927878.0, + "step": 9353 + }, + { + "epoch": 1.1899249459356316, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.515644073486328, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8747241497039795, + "num_tokens": 356963339.0, + "step": 9354 + }, + { + "epoch": 1.1900521562142221, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.775026321411133, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8691834807395935, + "num_tokens": 357002999.0, + "step": 9355 + }, + { + "epoch": 1.1901793664928126, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.667625427246094, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.868069052696228, + "num_tokens": 357044305.0, + "step": 9356 + }, + { + "epoch": 1.1903065767714032, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.652597427368164, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8673657774925232, + "num_tokens": 357082197.0, + "step": 9357 + }, + { + "epoch": 1.1904337870499937, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.851903915405273, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8460756540298462, + "num_tokens": 357119541.0, + "step": 9358 + }, + { + "epoch": 1.1905609973285842, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.550689697265625, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.887184739112854, + "num_tokens": 357159176.0, + "step": 9359 + }, + { + "epoch": 1.1906882076071748, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.659454345703125, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8651349544525146, + "num_tokens": 357199204.0, + "step": 9360 + }, + { + "epoch": 1.1908154178857653, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.550884246826172, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8788105249404907, + "num_tokens": 357236900.0, + "step": 9361 + }, + { + "epoch": 1.1909426281643556, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.418392181396484, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8657108545303345, + "num_tokens": 357271622.0, + "step": 9362 + }, + { + "epoch": 1.1910698384429461, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.727691650390625, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.876781165599823, + "num_tokens": 357311145.0, + "step": 9363 + }, + { + "epoch": 1.1911970487215366, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70640754699707, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8822839856147766, + "num_tokens": 357344609.0, + "step": 9364 + }, + { + "epoch": 1.1913242590001272, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.772912979125977, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8630544543266296, + "num_tokens": 357378896.0, + "step": 9365 + }, + { + "epoch": 1.1914514692787177, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70345687866211, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8696087598800659, + "num_tokens": 357424730.0, + "step": 9366 + }, + { + "epoch": 1.1915786795573082, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70985984802246, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8466252088546753, + "num_tokens": 357461152.0, + "step": 9367 + }, + { + "epoch": 1.1917058898358988, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.720537185668945, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8591184020042419, + "num_tokens": 357502305.0, + "step": 9368 + }, + { + "epoch": 1.1918331001144893, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.46255111694336, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8602814078330994, + "num_tokens": 357539187.0, + "step": 9369 + }, + { + "epoch": 1.1919603103930798, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.745031356811523, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8547337055206299, + "num_tokens": 357569520.0, + "step": 9370 + }, + { + "epoch": 1.1920875206716703, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.5440731048584, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8654056191444397, + "num_tokens": 357605253.0, + "step": 9371 + }, + { + "epoch": 1.1922147309502609, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.775718688964844, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8757026195526123, + "num_tokens": 357639701.0, + "step": 9372 + }, + { + "epoch": 1.1923419412288512, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.46461296081543, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8735443353652954, + "num_tokens": 357675611.0, + "step": 9373 + }, + { + "epoch": 1.1924691515074417, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.772733688354492, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8776687383651733, + "num_tokens": 357708521.0, + "step": 9374 + }, + { + "epoch": 1.1925963617860322, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.586376190185547, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8560150861740112, + "num_tokens": 357749970.0, + "step": 9375 + }, + { + "epoch": 1.1927235720646228, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.719972610473633, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8754217624664307, + "num_tokens": 357783287.0, + "step": 9376 + }, + { + "epoch": 1.1928507823432133, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.719736099243164, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8668088912963867, + "num_tokens": 357818883.0, + "step": 9377 + }, + { + "epoch": 1.1929779926218038, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.584135055541992, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8783637881278992, + "num_tokens": 357862862.0, + "step": 9378 + }, + { + "epoch": 1.1931052029003943, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.705425262451172, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8545534610748291, + "num_tokens": 357901574.0, + "step": 9379 + }, + { + "epoch": 1.1932324131789849, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.60462760925293, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8638746738433838, + "num_tokens": 357941822.0, + "step": 9380 + }, + { + "epoch": 1.1933596234575754, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70901870727539, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8588317632675171, + "num_tokens": 357982820.0, + "step": 9381 + }, + { + "epoch": 1.193486833736166, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.492155075073242, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8632900714874268, + "num_tokens": 358020491.0, + "step": 9382 + }, + { + "epoch": 1.1936140440147565, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.751991271972656, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8669359683990479, + "num_tokens": 358059775.0, + "step": 9383 + }, + { + "epoch": 1.193741254293347, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.776424407958984, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8735383749008179, + "num_tokens": 358098672.0, + "step": 9384 + }, + { + "epoch": 1.1938684645719375, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.672853469848633, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8576532602310181, + "num_tokens": 358140850.0, + "step": 9385 + }, + { + "epoch": 1.193995674850528, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.854494094848633, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8722377419471741, + "num_tokens": 358175503.0, + "step": 9386 + }, + { + "epoch": 1.1941228851291183, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.69664764404297, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8689652681350708, + "num_tokens": 358210615.0, + "step": 9387 + }, + { + "epoch": 1.1942500954077089, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.84481430053711, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8680785298347473, + "num_tokens": 358246605.0, + "step": 9388 + }, + { + "epoch": 1.1943773056862994, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.670373916625977, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8620907664299011, + "num_tokens": 358283136.0, + "step": 9389 + }, + { + "epoch": 1.19450451596489, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.5650634765625, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8737477660179138, + "num_tokens": 358325437.0, + "step": 9390 + }, + { + "epoch": 1.1946317262434805, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.828922271728516, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8656696081161499, + "num_tokens": 358365360.0, + "step": 9391 + }, + { + "epoch": 1.194758936522071, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.640094757080078, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8825557231903076, + "num_tokens": 358406502.0, + "step": 9392 + }, + { + "epoch": 1.1948861468006615, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.63544464111328, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8611835837364197, + "num_tokens": 358437220.0, + "step": 9393 + }, + { + "epoch": 1.195013357079252, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.535234451293945, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8541294932365417, + "num_tokens": 358475314.0, + "step": 9394 + }, + { + "epoch": 1.1951405673578426, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.60625648498535, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8761274218559265, + "num_tokens": 358520777.0, + "step": 9395 + }, + { + "epoch": 1.195267777636433, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.696189880371094, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8856987953186035, + "num_tokens": 358557973.0, + "step": 9396 + }, + { + "epoch": 1.1953949879150234, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.72822380065918, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8730628490447998, + "num_tokens": 358596795.0, + "step": 9397 + }, + { + "epoch": 1.195522198193614, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.67210578918457, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8666556477546692, + "num_tokens": 358632146.0, + "step": 9398 + }, + { + "epoch": 1.1956494084722045, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.584707260131836, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8741642832756042, + "num_tokens": 358672194.0, + "step": 9399 + }, + { + "epoch": 1.195776618750795, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.707096099853516, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8732135891914368, + "num_tokens": 358710192.0, + "step": 9400 + }, + { + "epoch": 1.1959038290293855, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.771631240844727, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8522077798843384, + "num_tokens": 358749284.0, + "step": 9401 + }, + { + "epoch": 1.196031039307976, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.739286422729492, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8621050715446472, + "num_tokens": 358782830.0, + "step": 9402 + }, + { + "epoch": 1.1961582495865666, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.655651092529297, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8641313910484314, + "num_tokens": 358820696.0, + "step": 9403 + }, + { + "epoch": 1.196285459865157, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.938371658325195, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8722918033599854, + "num_tokens": 358853616.0, + "step": 9404 + }, + { + "epoch": 1.1964126701437476, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.558881759643555, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8749335408210754, + "num_tokens": 358893420.0, + "step": 9405 + }, + { + "epoch": 1.1965398804223382, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.711549758911133, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8679536581039429, + "num_tokens": 358930704.0, + "step": 9406 + }, + { + "epoch": 1.1966670907009287, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.404460906982422, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8794289827346802, + "num_tokens": 358967730.0, + "step": 9407 + }, + { + "epoch": 1.1967943009795192, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.742042541503906, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8782442212104797, + "num_tokens": 359007241.0, + "step": 9408 + }, + { + "epoch": 1.1969215112581097, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.508310317993164, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8536428809165955, + "num_tokens": 359045573.0, + "step": 9409 + }, + { + "epoch": 1.1970487215367003, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.67142105102539, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8696920871734619, + "num_tokens": 359083420.0, + "step": 9410 + }, + { + "epoch": 1.1971759318152906, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.72987174987793, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8792160749435425, + "num_tokens": 359117555.0, + "step": 9411 + }, + { + "epoch": 1.1973031420938811, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.58652687072754, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.863060712814331, + "num_tokens": 359154597.0, + "step": 9412 + }, + { + "epoch": 1.1974303523724716, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78512954711914, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8618526458740234, + "num_tokens": 359190074.0, + "step": 9413 + }, + { + "epoch": 1.1975575626510622, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.791473388671875, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.855248749256134, + "num_tokens": 359232711.0, + "step": 9414 + }, + { + "epoch": 1.1976847729296527, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.685564041137695, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8660051226615906, + "num_tokens": 359265582.0, + "step": 9415 + }, + { + "epoch": 1.1978119832082432, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.768518447875977, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8736100792884827, + "num_tokens": 359303177.0, + "step": 9416 + }, + { + "epoch": 1.1979391934868338, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.695755004882812, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8763192892074585, + "num_tokens": 359338712.0, + "step": 9417 + }, + { + "epoch": 1.1980664037654243, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.52942657470703, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8751259446144104, + "num_tokens": 359372849.0, + "step": 9418 + }, + { + "epoch": 1.1981936140440148, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.889514923095703, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.87152498960495, + "num_tokens": 359407981.0, + "step": 9419 + }, + { + "epoch": 1.1983208243226053, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.607892990112305, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8454837203025818, + "num_tokens": 359450180.0, + "step": 9420 + }, + { + "epoch": 1.1984480346011959, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80145263671875, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8655262589454651, + "num_tokens": 359487909.0, + "step": 9421 + }, + { + "epoch": 1.1985752448797862, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.509780883789062, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8788074851036072, + "num_tokens": 359522609.0, + "step": 9422 + }, + { + "epoch": 1.1987024551583767, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90741539001465, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8634480237960815, + "num_tokens": 359560735.0, + "step": 9423 + }, + { + "epoch": 1.1988296654369672, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.67190933227539, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8606345653533936, + "num_tokens": 359602994.0, + "step": 9424 + }, + { + "epoch": 1.1989568757155578, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.75493621826172, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8662037253379822, + "num_tokens": 359638864.0, + "step": 9425 + }, + { + "epoch": 1.1990840859941483, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87054443359375, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8743373155593872, + "num_tokens": 359674202.0, + "step": 9426 + }, + { + "epoch": 1.1992112962727388, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.950698852539062, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8730673789978027, + "num_tokens": 359710408.0, + "step": 9427 + }, + { + "epoch": 1.1993385065513293, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.573396682739258, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8743247985839844, + "num_tokens": 359749068.0, + "step": 9428 + }, + { + "epoch": 1.1994657168299199, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74772834777832, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8675713539123535, + "num_tokens": 359788449.0, + "step": 9429 + }, + { + "epoch": 1.1995929271085104, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.806489944458008, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8605152368545532, + "num_tokens": 359824167.0, + "step": 9430 + }, + { + "epoch": 1.199720137387101, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.598148345947266, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8609955906867981, + "num_tokens": 359861476.0, + "step": 9431 + }, + { + "epoch": 1.1998473476656915, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.702598571777344, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8650485277175903, + "num_tokens": 359906463.0, + "step": 9432 + }, + { + "epoch": 1.199974557944282, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.779436111450195, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8695552349090576, + "num_tokens": 359940576.0, + "step": 9433 + }, + { + "epoch": 1.2001017682228725, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.75534439086914, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8768726587295532, + "num_tokens": 359973616.0, + "step": 9434 + }, + { + "epoch": 1.200228978501463, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.738283157348633, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8785247206687927, + "num_tokens": 360011059.0, + "step": 9435 + }, + { + "epoch": 1.2003561887800533, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.654531478881836, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8760108947753906, + "num_tokens": 360048307.0, + "step": 9436 + }, + { + "epoch": 1.2004833990586439, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.73695945739746, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.866487443447113, + "num_tokens": 360086946.0, + "step": 9437 + }, + { + "epoch": 1.2006106093372344, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74604034423828, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8704754114151001, + "num_tokens": 360121908.0, + "step": 9438 + }, + { + "epoch": 1.200737819615825, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.85769271850586, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8568917512893677, + "num_tokens": 360164349.0, + "step": 9439 + }, + { + "epoch": 1.2008650298944155, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.65620231628418, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8820521831512451, + "num_tokens": 360197660.0, + "step": 9440 + }, + { + "epoch": 1.200992240173006, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.768125534057617, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8449618816375732, + "num_tokens": 360240297.0, + "step": 9441 + }, + { + "epoch": 1.2011194504515965, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.764328002929688, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8569685220718384, + "num_tokens": 360281566.0, + "step": 9442 + }, + { + "epoch": 1.201246660730187, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.737987518310547, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8728954195976257, + "num_tokens": 360313927.0, + "step": 9443 + }, + { + "epoch": 1.2013738710087776, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.432737350463867, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8705263733863831, + "num_tokens": 360349369.0, + "step": 9444 + }, + { + "epoch": 1.201501081287368, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.687728881835938, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8661343455314636, + "num_tokens": 360386996.0, + "step": 9445 + }, + { + "epoch": 1.2016282915659584, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.56900405883789, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8708508014678955, + "num_tokens": 360424883.0, + "step": 9446 + }, + { + "epoch": 1.201755501844549, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.685914993286133, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8716065883636475, + "num_tokens": 360457992.0, + "step": 9447 + }, + { + "epoch": 1.2018827121231395, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.844148635864258, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8518350124359131, + "num_tokens": 360497934.0, + "step": 9448 + }, + { + "epoch": 1.20200992240173, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.42245101928711, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8642547130584717, + "num_tokens": 360533928.0, + "step": 9449 + }, + { + "epoch": 1.2021371326803205, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74291229248047, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8689706325531006, + "num_tokens": 360571076.0, + "step": 9450 + }, + { + "epoch": 1.202264342958911, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.773210525512695, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8588472008705139, + "num_tokens": 360611957.0, + "step": 9451 + }, + { + "epoch": 1.2023915532375016, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78321075439453, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8602232933044434, + "num_tokens": 360647823.0, + "step": 9452 + }, + { + "epoch": 1.202518763516092, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.76064109802246, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8572075366973877, + "num_tokens": 360688341.0, + "step": 9453 + }, + { + "epoch": 1.2026459737946826, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.830059051513672, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.859227180480957, + "num_tokens": 360730417.0, + "step": 9454 + }, + { + "epoch": 1.2027731840732732, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.733009338378906, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8474162220954895, + "num_tokens": 360771931.0, + "step": 9455 + }, + { + "epoch": 1.2029003943518637, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.882455825805664, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8642873764038086, + "num_tokens": 360811460.0, + "step": 9456 + }, + { + "epoch": 1.2030276046304542, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78750991821289, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8782972097396851, + "num_tokens": 360849476.0, + "step": 9457 + }, + { + "epoch": 1.2031548149090447, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.63274574279785, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8561097383499146, + "num_tokens": 360891178.0, + "step": 9458 + }, + { + "epoch": 1.2032820251876353, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91058921813965, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8623180985450745, + "num_tokens": 360930503.0, + "step": 9459 + }, + { + "epoch": 1.2034092354662256, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.71845245361328, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8632372617721558, + "num_tokens": 360963730.0, + "step": 9460 + }, + { + "epoch": 1.203536445744816, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83233070373535, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.85224449634552, + "num_tokens": 361003796.0, + "step": 9461 + }, + { + "epoch": 1.2036636560234066, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.682912826538086, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8666731119155884, + "num_tokens": 361044818.0, + "step": 9462 + }, + { + "epoch": 1.2037908663019972, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.630233764648438, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8767077922821045, + "num_tokens": 361080692.0, + "step": 9463 + }, + { + "epoch": 1.2039180765805877, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78076934814453, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8568460941314697, + "num_tokens": 361116609.0, + "step": 9464 + }, + { + "epoch": 1.2040452868591782, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.751556396484375, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.859432578086853, + "num_tokens": 361154559.0, + "step": 9465 + }, + { + "epoch": 1.2041724971377687, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.561580657958984, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8750923871994019, + "num_tokens": 361191005.0, + "step": 9466 + }, + { + "epoch": 1.2042997074163593, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.758071899414062, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8640159368515015, + "num_tokens": 361230191.0, + "step": 9467 + }, + { + "epoch": 1.2044269176949498, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.77539825439453, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8589137196540833, + "num_tokens": 361269132.0, + "step": 9468 + }, + { + "epoch": 1.2045541279735403, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.582962036132812, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8589226007461548, + "num_tokens": 361306674.0, + "step": 9469 + }, + { + "epoch": 1.2046813382521309, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.809415817260742, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.868219792842865, + "num_tokens": 361348309.0, + "step": 9470 + }, + { + "epoch": 1.2048085485307212, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.63478660583496, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8617006540298462, + "num_tokens": 361386824.0, + "step": 9471 + }, + { + "epoch": 1.2049357588093117, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.889341354370117, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8468717336654663, + "num_tokens": 361426590.0, + "step": 9472 + }, + { + "epoch": 1.2050629690879022, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86025047302246, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8516642451286316, + "num_tokens": 361463259.0, + "step": 9473 + }, + { + "epoch": 1.2051901793664928, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.507089614868164, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8753912448883057, + "num_tokens": 361498750.0, + "step": 9474 + }, + { + "epoch": 1.2053173896450833, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.853017807006836, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8559768795967102, + "num_tokens": 361537101.0, + "step": 9475 + }, + { + "epoch": 1.2054445999236738, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.884075164794922, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8696982860565186, + "num_tokens": 361572130.0, + "step": 9476 + }, + { + "epoch": 1.2055718102022643, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.811702728271484, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8820126056671143, + "num_tokens": 361608484.0, + "step": 9477 + }, + { + "epoch": 1.2056990204808549, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.744016647338867, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8831612467765808, + "num_tokens": 361650361.0, + "step": 9478 + }, + { + "epoch": 1.2058262307594454, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80477523803711, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.873193621635437, + "num_tokens": 361680670.0, + "step": 9479 + }, + { + "epoch": 1.205953441038036, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.629838943481445, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8720769882202148, + "num_tokens": 361723504.0, + "step": 9480 + }, + { + "epoch": 1.2060806513166265, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.68416976928711, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8725535869598389, + "num_tokens": 361765850.0, + "step": 9481 + }, + { + "epoch": 1.206207861595217, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.842994689941406, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8567876815795898, + "num_tokens": 361804375.0, + "step": 9482 + }, + { + "epoch": 1.2063350718738075, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.66921615600586, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8626898527145386, + "num_tokens": 361834714.0, + "step": 9483 + }, + { + "epoch": 1.206462282152398, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.73354721069336, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8657299876213074, + "num_tokens": 361877568.0, + "step": 9484 + }, + { + "epoch": 1.2065894924309883, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.813642501831055, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8620805144309998, + "num_tokens": 361914785.0, + "step": 9485 + }, + { + "epoch": 1.2067167027095789, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80126190185547, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8732108473777771, + "num_tokens": 361956781.0, + "step": 9486 + }, + { + "epoch": 1.2068439129881694, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.790115356445312, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8760344982147217, + "num_tokens": 361997408.0, + "step": 9487 + }, + { + "epoch": 1.20697112326676, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.680667877197266, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8651977181434631, + "num_tokens": 362030226.0, + "step": 9488 + }, + { + "epoch": 1.2070983335453505, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.72277069091797, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8768570423126221, + "num_tokens": 362068370.0, + "step": 9489 + }, + { + "epoch": 1.207225543823941, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.68474006652832, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.874297022819519, + "num_tokens": 362108569.0, + "step": 9490 + }, + { + "epoch": 1.2073527541025315, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.71211051940918, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8584840893745422, + "num_tokens": 362140476.0, + "step": 9491 + }, + { + "epoch": 1.207479964381122, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.632259368896484, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8746976852416992, + "num_tokens": 362171132.0, + "step": 9492 + }, + { + "epoch": 1.2076071746597126, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.92950439453125, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.864554226398468, + "num_tokens": 362204791.0, + "step": 9493 + }, + { + "epoch": 1.207734384938303, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.696409225463867, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8701686859130859, + "num_tokens": 362242360.0, + "step": 9494 + }, + { + "epoch": 1.2078615952168934, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.685386657714844, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8686100840568542, + "num_tokens": 362276558.0, + "step": 9495 + }, + { + "epoch": 1.207988805495484, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.767498016357422, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8516495227813721, + "num_tokens": 362307055.0, + "step": 9496 + }, + { + "epoch": 1.2081160157740745, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.871692657470703, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.877324104309082, + "num_tokens": 362342433.0, + "step": 9497 + }, + { + "epoch": 1.208243226052665, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.638734817504883, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8877037763595581, + "num_tokens": 362377008.0, + "step": 9498 + }, + { + "epoch": 1.2083704363312555, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.7717227935791, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8807833790779114, + "num_tokens": 362412089.0, + "step": 9499 + }, + { + "epoch": 1.208497646609846, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.66004753112793, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8580727577209473, + "num_tokens": 362455817.0, + "step": 9500 + }, + { + "epoch": 1.2086248568884366, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.776731491088867, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8661977052688599, + "num_tokens": 362498859.0, + "step": 9501 + }, + { + "epoch": 1.208752067167027, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.740442276000977, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8651660680770874, + "num_tokens": 362542346.0, + "step": 9502 + }, + { + "epoch": 1.2088792774456176, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.707599639892578, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8849365711212158, + "num_tokens": 362575360.0, + "step": 9503 + }, + { + "epoch": 1.2090064877242082, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.653884887695312, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8668292164802551, + "num_tokens": 362614737.0, + "step": 9504 + }, + { + "epoch": 1.2091336980027987, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.804861068725586, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.861666202545166, + "num_tokens": 362653756.0, + "step": 9505 + }, + { + "epoch": 1.2092609082813892, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.863046646118164, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8680374622344971, + "num_tokens": 362690745.0, + "step": 9506 + }, + { + "epoch": 1.2093881185599797, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.808609008789062, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.875736653804779, + "num_tokens": 362726328.0, + "step": 9507 + }, + { + "epoch": 1.2095153288385703, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.51384925842285, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8694777488708496, + "num_tokens": 362764285.0, + "step": 9508 + }, + { + "epoch": 1.2096425391171606, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.766284942626953, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8479218482971191, + "num_tokens": 362794697.0, + "step": 9509 + }, + { + "epoch": 1.209769749395751, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.839313507080078, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8653704524040222, + "num_tokens": 362831618.0, + "step": 9510 + }, + { + "epoch": 1.2098969596743416, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.514026641845703, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8749982118606567, + "num_tokens": 362866404.0, + "step": 9511 + }, + { + "epoch": 1.2100241699529322, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.734142303466797, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.88270503282547, + "num_tokens": 362902554.0, + "step": 9512 + }, + { + "epoch": 1.2101513802315227, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.758747100830078, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8613447546958923, + "num_tokens": 362944169.0, + "step": 9513 + }, + { + "epoch": 1.2102785905101132, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.742183685302734, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.846268892288208, + "num_tokens": 362983789.0, + "step": 9514 + }, + { + "epoch": 1.2104058007887037, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.633651733398438, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.872309684753418, + "num_tokens": 363021551.0, + "step": 9515 + }, + { + "epoch": 1.2105330110672943, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.769136428833008, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8500850796699524, + "num_tokens": 363064951.0, + "step": 9516 + }, + { + "epoch": 1.2106602213458848, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.73676872253418, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.866502583026886, + "num_tokens": 363110876.0, + "step": 9517 + }, + { + "epoch": 1.2107874316244753, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.506183624267578, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8637858033180237, + "num_tokens": 363155310.0, + "step": 9518 + }, + { + "epoch": 1.2109146419030659, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.71953773498535, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.87640380859375, + "num_tokens": 363193166.0, + "step": 9519 + }, + { + "epoch": 1.2110418521816562, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.594507217407227, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.848745584487915, + "num_tokens": 363225163.0, + "step": 9520 + }, + { + "epoch": 1.2111690624602467, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.743940353393555, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.865256130695343, + "num_tokens": 363264720.0, + "step": 9521 + }, + { + "epoch": 1.2112962727388372, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.708412170410156, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8507556915283203, + "num_tokens": 363305029.0, + "step": 9522 + }, + { + "epoch": 1.2114234830174277, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6570987701416, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8736580014228821, + "num_tokens": 363344128.0, + "step": 9523 + }, + { + "epoch": 1.2115506932960183, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.63136100769043, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8659152984619141, + "num_tokens": 363385903.0, + "step": 9524 + }, + { + "epoch": 1.2116779035746088, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.956212997436523, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8677183985710144, + "num_tokens": 363425216.0, + "step": 9525 + }, + { + "epoch": 1.2118051138531993, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.797653198242188, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8777249455451965, + "num_tokens": 363462691.0, + "step": 9526 + }, + { + "epoch": 1.2119323241317899, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.584720611572266, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8513655066490173, + "num_tokens": 363495849.0, + "step": 9527 + }, + { + "epoch": 1.2120595344103804, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.750402450561523, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8658496737480164, + "num_tokens": 363535268.0, + "step": 9528 + }, + { + "epoch": 1.212186744688971, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.65552520751953, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8484601974487305, + "num_tokens": 363573523.0, + "step": 9529 + }, + { + "epoch": 1.2123139549675614, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.906818389892578, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8656163215637207, + "num_tokens": 363607545.0, + "step": 9530 + }, + { + "epoch": 1.212441165246152, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.921306610107422, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8686836957931519, + "num_tokens": 363648456.0, + "step": 9531 + }, + { + "epoch": 1.2125683755247425, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.795175552368164, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8754928112030029, + "num_tokens": 363688726.0, + "step": 9532 + }, + { + "epoch": 1.212695585803333, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83642578125, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8595694899559021, + "num_tokens": 363728557.0, + "step": 9533 + }, + { + "epoch": 1.2128227960819233, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78038215637207, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8657735586166382, + "num_tokens": 363764191.0, + "step": 9534 + }, + { + "epoch": 1.2129500063605139, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.84439468383789, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8548355102539062, + "num_tokens": 363798126.0, + "step": 9535 + }, + { + "epoch": 1.2130772166391044, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.818952560424805, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8505735397338867, + "num_tokens": 363836037.0, + "step": 9536 + }, + { + "epoch": 1.213204426917695, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.689205169677734, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8866801857948303, + "num_tokens": 363872359.0, + "step": 9537 + }, + { + "epoch": 1.2133316371962855, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.769006729125977, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8607989549636841, + "num_tokens": 363914206.0, + "step": 9538 + }, + { + "epoch": 1.213458847474876, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.773929595947266, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8672776222229004, + "num_tokens": 363956147.0, + "step": 9539 + }, + { + "epoch": 1.2135860577534665, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.727781295776367, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8836996555328369, + "num_tokens": 363986328.0, + "step": 9540 + }, + { + "epoch": 1.213713268032057, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.699451446533203, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8633599281311035, + "num_tokens": 364024905.0, + "step": 9541 + }, + { + "epoch": 1.2138404783106476, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.94011116027832, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8673392534255981, + "num_tokens": 364068628.0, + "step": 9542 + }, + { + "epoch": 1.213967688589238, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.880970001220703, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8628575801849365, + "num_tokens": 364104261.0, + "step": 9543 + }, + { + "epoch": 1.2140948988678284, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.761037826538086, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8634070158004761, + "num_tokens": 364147524.0, + "step": 9544 + }, + { + "epoch": 1.214222109146419, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.769386291503906, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8679680824279785, + "num_tokens": 364187715.0, + "step": 9545 + }, + { + "epoch": 1.2143493194250095, + "ewc_loss": 0.03076171875, + "ewc_loss_parallel": 3.075599670410156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.765586853027344, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8770312666893005, + "num_tokens": 364226081.0, + "step": 9546 + }, + { + "epoch": 1.2144765297036, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.846275329589844, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8671308159828186, + "num_tokens": 364268390.0, + "step": 9547 + }, + { + "epoch": 1.2146037399821905, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70577621459961, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8693865537643433, + "num_tokens": 364310176.0, + "step": 9548 + }, + { + "epoch": 1.214730950260781, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.638851165771484, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8782633543014526, + "num_tokens": 364345652.0, + "step": 9549 + }, + { + "epoch": 1.2148581605393716, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.883975982666016, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8623746633529663, + "num_tokens": 364383810.0, + "step": 9550 + }, + { + "epoch": 1.214985370817962, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.754467010498047, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8704603910446167, + "num_tokens": 364421169.0, + "step": 9551 + }, + { + "epoch": 1.2151125810965526, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95923614501953, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8604063987731934, + "num_tokens": 364452552.0, + "step": 9552 + }, + { + "epoch": 1.2152397913751432, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.885337829589844, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8646435737609863, + "num_tokens": 364486779.0, + "step": 9553 + }, + { + "epoch": 1.2153670016537337, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9345645904541, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8763692378997803, + "num_tokens": 364526237.0, + "step": 9554 + }, + { + "epoch": 1.2154942119323242, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.817821502685547, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8722534775733948, + "num_tokens": 364568446.0, + "step": 9555 + }, + { + "epoch": 1.2156214222109147, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.993955612182617, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8740871548652649, + "num_tokens": 364604448.0, + "step": 9556 + }, + { + "epoch": 1.2157486324895053, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.725793838500977, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8637478351593018, + "num_tokens": 364642707.0, + "step": 9557 + }, + { + "epoch": 1.2158758427680956, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.7563419342041, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8614727258682251, + "num_tokens": 364678428.0, + "step": 9558 + }, + { + "epoch": 1.216003053046686, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.894027709960938, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8800293207168579, + "num_tokens": 364718915.0, + "step": 9559 + }, + { + "epoch": 1.2161302633252766, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.825305938720703, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8830794095993042, + "num_tokens": 364756030.0, + "step": 9560 + }, + { + "epoch": 1.2162574736038672, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.724666595458984, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.863773763179779, + "num_tokens": 364796158.0, + "step": 9561 + }, + { + "epoch": 1.2163846838824577, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.947093963623047, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8751342296600342, + "num_tokens": 364832722.0, + "step": 9562 + }, + { + "epoch": 1.2165118941610482, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.011676788330078, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8644391298294067, + "num_tokens": 364874120.0, + "step": 9563 + }, + { + "epoch": 1.2166391044396387, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.82469940185547, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.868074893951416, + "num_tokens": 364913166.0, + "step": 9564 + }, + { + "epoch": 1.2167663147182293, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.944276809692383, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8489251732826233, + "num_tokens": 364949586.0, + "step": 9565 + }, + { + "epoch": 1.2168935249968198, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86808204650879, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8487450480461121, + "num_tokens": 364989070.0, + "step": 9566 + }, + { + "epoch": 1.2170207352754103, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8581485748291, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.870442271232605, + "num_tokens": 365027050.0, + "step": 9567 + }, + { + "epoch": 1.2171479455540009, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.856159210205078, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8666428327560425, + "num_tokens": 365060050.0, + "step": 9568 + }, + { + "epoch": 1.2172751558325912, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.797788619995117, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8362602591514587, + "num_tokens": 365098030.0, + "step": 9569 + }, + { + "epoch": 1.2174023661111817, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.787633895874023, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8684239983558655, + "num_tokens": 365141143.0, + "step": 9570 + }, + { + "epoch": 1.2175295763897722, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.935993194580078, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8431860208511353, + "num_tokens": 365174204.0, + "step": 9571 + }, + { + "epoch": 1.2176567866683627, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.77485466003418, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8667779564857483, + "num_tokens": 365207170.0, + "step": 9572 + }, + { + "epoch": 1.2177839969469533, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.909292221069336, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.870063066482544, + "num_tokens": 365248926.0, + "step": 9573 + }, + { + "epoch": 1.2179112072255438, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.81229591369629, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.869417667388916, + "num_tokens": 365283588.0, + "step": 9574 + }, + { + "epoch": 1.2180384175041343, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05471420288086, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8640398383140564, + "num_tokens": 365324869.0, + "step": 9575 + }, + { + "epoch": 1.2181656277827249, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87741470336914, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8631011247634888, + "num_tokens": 365366832.0, + "step": 9576 + }, + { + "epoch": 1.2182928380613154, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.871360778808594, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8640522956848145, + "num_tokens": 365401148.0, + "step": 9577 + }, + { + "epoch": 1.218420048339906, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.899709701538086, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8684372901916504, + "num_tokens": 365442303.0, + "step": 9578 + }, + { + "epoch": 1.2185472586184964, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.690214157104492, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8785255551338196, + "num_tokens": 365477877.0, + "step": 9579 + }, + { + "epoch": 1.218674468897087, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80655288696289, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8543643355369568, + "num_tokens": 365520648.0, + "step": 9580 + }, + { + "epoch": 1.2188016791756775, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.981760025024414, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8485968112945557, + "num_tokens": 365555129.0, + "step": 9581 + }, + { + "epoch": 1.218928889454268, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.756834030151367, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8711649775505066, + "num_tokens": 365594368.0, + "step": 9582 + }, + { + "epoch": 1.2190560997328583, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.82646942138672, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.865681529045105, + "num_tokens": 365634763.0, + "step": 9583 + }, + { + "epoch": 1.2191833100114489, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.696340560913086, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8820668458938599, + "num_tokens": 365675951.0, + "step": 9584 + }, + { + "epoch": 1.2193105202900394, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.028139114379883, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8741632699966431, + "num_tokens": 365710348.0, + "step": 9585 + }, + { + "epoch": 1.21943773056863, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.699739456176758, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8681931495666504, + "num_tokens": 365745166.0, + "step": 9586 + }, + { + "epoch": 1.2195649408472204, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90877914428711, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8793770670890808, + "num_tokens": 365780122.0, + "step": 9587 + }, + { + "epoch": 1.219692151125811, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03639793395996, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8656420111656189, + "num_tokens": 365816326.0, + "step": 9588 + }, + { + "epoch": 1.2198193614044015, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.163806915283203, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8400543332099915, + "num_tokens": 365858938.0, + "step": 9589 + }, + { + "epoch": 1.219946571682992, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74925994873047, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8560076355934143, + "num_tokens": 365893892.0, + "step": 9590 + }, + { + "epoch": 1.2200737819615826, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.923206329345703, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8570793867111206, + "num_tokens": 365938787.0, + "step": 9591 + }, + { + "epoch": 1.220200992240173, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03551483154297, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.865752100944519, + "num_tokens": 365972664.0, + "step": 9592 + }, + { + "epoch": 1.2203282025187634, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.717025756835938, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8727088570594788, + "num_tokens": 366013367.0, + "step": 9593 + }, + { + "epoch": 1.220455412797354, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.638517379760742, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.872421145439148, + "num_tokens": 366051637.0, + "step": 9594 + }, + { + "epoch": 1.2205826230759445, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.667978286743164, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8634703159332275, + "num_tokens": 366092713.0, + "step": 9595 + }, + { + "epoch": 1.220709833354535, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.700103759765625, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.867022693157196, + "num_tokens": 366131594.0, + "step": 9596 + }, + { + "epoch": 1.2208370436331255, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.900386810302734, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8606820106506348, + "num_tokens": 366170871.0, + "step": 9597 + }, + { + "epoch": 1.220964253911716, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78499412536621, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8563713431358337, + "num_tokens": 366212974.0, + "step": 9598 + }, + { + "epoch": 1.2210914641903066, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.814830780029297, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8582555651664734, + "num_tokens": 366255904.0, + "step": 9599 + }, + { + "epoch": 1.221218674468897, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80765724182129, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8717083930969238, + "num_tokens": 366288887.0, + "step": 9600 + }, + { + "epoch": 1.2213458847474876, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.965991973876953, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8798304200172424, + "num_tokens": 366328796.0, + "step": 9601 + }, + { + "epoch": 1.2214730950260781, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.865243911743164, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.863493800163269, + "num_tokens": 366373093.0, + "step": 9602 + }, + { + "epoch": 1.2216003053046687, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.825342178344727, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8752117156982422, + "num_tokens": 366414353.0, + "step": 9603 + }, + { + "epoch": 1.2217275155832592, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.964628219604492, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.879427433013916, + "num_tokens": 366450683.0, + "step": 9604 + }, + { + "epoch": 1.2218547258618497, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.729595184326172, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8705268502235413, + "num_tokens": 366487158.0, + "step": 9605 + }, + { + "epoch": 1.2219819361404403, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.872385025024414, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8681859970092773, + "num_tokens": 366529984.0, + "step": 9606 + }, + { + "epoch": 1.2221091464190306, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.807058334350586, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8703527450561523, + "num_tokens": 366564100.0, + "step": 9607 + }, + { + "epoch": 1.222236356697621, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.791837692260742, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8731738328933716, + "num_tokens": 366603752.0, + "step": 9608 + }, + { + "epoch": 1.2223635669762116, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90421485900879, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8685214519500732, + "num_tokens": 366642170.0, + "step": 9609 + }, + { + "epoch": 1.2224907772548022, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86543846130371, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8526124954223633, + "num_tokens": 366684038.0, + "step": 9610 + }, + { + "epoch": 1.2226179875333927, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.822925567626953, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8633748292922974, + "num_tokens": 366717832.0, + "step": 9611 + }, + { + "epoch": 1.2227451978119832, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.818580627441406, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8606692552566528, + "num_tokens": 366757999.0, + "step": 9612 + }, + { + "epoch": 1.2228724080905737, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.971662521362305, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8753259778022766, + "num_tokens": 366798533.0, + "step": 9613 + }, + { + "epoch": 1.2229996183691643, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.823646545410156, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8633900880813599, + "num_tokens": 366843066.0, + "step": 9614 + }, + { + "epoch": 1.2231268286477548, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.635459899902344, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8816080689430237, + "num_tokens": 366883190.0, + "step": 9615 + }, + { + "epoch": 1.2232540389263453, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.884521484375, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8703169822692871, + "num_tokens": 366920642.0, + "step": 9616 + }, + { + "epoch": 1.2233812492049359, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.728065490722656, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8665501475334167, + "num_tokens": 366954468.0, + "step": 9617 + }, + { + "epoch": 1.2235084594835262, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.851341247558594, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8506125807762146, + "num_tokens": 366991170.0, + "step": 9618 + }, + { + "epoch": 1.2236356697621167, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.776138305664062, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.87092125415802, + "num_tokens": 367031833.0, + "step": 9619 + }, + { + "epoch": 1.2237628800407072, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.883649826049805, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8613644242286682, + "num_tokens": 367074709.0, + "step": 9620 + }, + { + "epoch": 1.2238900903192977, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90238380432129, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8636603951454163, + "num_tokens": 367106820.0, + "step": 9621 + }, + { + "epoch": 1.2240173005978883, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.844484329223633, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8803486227989197, + "num_tokens": 367142298.0, + "step": 9622 + }, + { + "epoch": 1.2241445108764788, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88605499267578, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.856783390045166, + "num_tokens": 367179957.0, + "step": 9623 + }, + { + "epoch": 1.2242717211550693, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87251091003418, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8689159750938416, + "num_tokens": 367226800.0, + "step": 9624 + }, + { + "epoch": 1.2243989314336599, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90260124206543, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8707275390625, + "num_tokens": 367257168.0, + "step": 9625 + }, + { + "epoch": 1.2245261417122504, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.7482967376709, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8683272004127502, + "num_tokens": 367295254.0, + "step": 9626 + }, + { + "epoch": 1.224653351990841, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.768856048583984, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8547553420066833, + "num_tokens": 367335814.0, + "step": 9627 + }, + { + "epoch": 1.2247805622694314, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86652946472168, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8638384342193604, + "num_tokens": 367377592.0, + "step": 9628 + }, + { + "epoch": 1.224907772548022, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.789703369140625, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8589025139808655, + "num_tokens": 367408069.0, + "step": 9629 + }, + { + "epoch": 1.2250349828266125, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.850648880004883, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.867851734161377, + "num_tokens": 367444787.0, + "step": 9630 + }, + { + "epoch": 1.225162193105203, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.923419952392578, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8527368307113647, + "num_tokens": 367486848.0, + "step": 9631 + }, + { + "epoch": 1.2252894033837933, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.845571517944336, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8770277500152588, + "num_tokens": 367530456.0, + "step": 9632 + }, + { + "epoch": 1.2254166136623839, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95531463623047, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8695381879806519, + "num_tokens": 367566513.0, + "step": 9633 + }, + { + "epoch": 1.2255438239409744, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.886533737182617, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8458069562911987, + "num_tokens": 367607757.0, + "step": 9634 + }, + { + "epoch": 1.225671034219565, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.747814178466797, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8682140111923218, + "num_tokens": 367645525.0, + "step": 9635 + }, + { + "epoch": 1.2257982444981554, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.787921905517578, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8752476572990417, + "num_tokens": 367680840.0, + "step": 9636 + }, + { + "epoch": 1.225925454776746, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.735347747802734, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.874313235282898, + "num_tokens": 367717444.0, + "step": 9637 + }, + { + "epoch": 1.2260526650553365, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.85276985168457, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8459066152572632, + "num_tokens": 367756548.0, + "step": 9638 + }, + { + "epoch": 1.226179875333927, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.61335563659668, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.860992968082428, + "num_tokens": 367798341.0, + "step": 9639 + }, + { + "epoch": 1.2263070856125176, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.815309524536133, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.873260498046875, + "num_tokens": 367835087.0, + "step": 9640 + }, + { + "epoch": 1.226434295891108, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.915542602539062, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.867640495300293, + "num_tokens": 367873816.0, + "step": 9641 + }, + { + "epoch": 1.2265615061696984, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.548158645629883, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8626724481582642, + "num_tokens": 367913621.0, + "step": 9642 + }, + { + "epoch": 1.226688716448289, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.97014045715332, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8574178814888, + "num_tokens": 367946881.0, + "step": 9643 + }, + { + "epoch": 1.2268159267268794, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.662654876708984, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8705106973648071, + "num_tokens": 367980806.0, + "step": 9644 + }, + { + "epoch": 1.22694313700547, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03285789489746, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8732664585113525, + "num_tokens": 368016041.0, + "step": 9645 + }, + { + "epoch": 1.2270703472840605, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.62049102783203, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8645811676979065, + "num_tokens": 368052331.0, + "step": 9646 + }, + { + "epoch": 1.227197557562651, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8167781829834, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8752511143684387, + "num_tokens": 368091458.0, + "step": 9647 + }, + { + "epoch": 1.2273247678412416, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.733505249023438, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8551845550537109, + "num_tokens": 368132433.0, + "step": 9648 + }, + { + "epoch": 1.227451978119832, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.742326736450195, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8672927618026733, + "num_tokens": 368171030.0, + "step": 9649 + }, + { + "epoch": 1.2275791883984226, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.834531784057617, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.855162501335144, + "num_tokens": 368211555.0, + "step": 9650 + }, + { + "epoch": 1.2277063986770131, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.798763275146484, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8720412254333496, + "num_tokens": 368252076.0, + "step": 9651 + }, + { + "epoch": 1.2278336089556037, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80188751220703, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8601871132850647, + "num_tokens": 368291265.0, + "step": 9652 + }, + { + "epoch": 1.2279608192341942, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.729127883911133, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8597638010978699, + "num_tokens": 368332178.0, + "step": 9653 + }, + { + "epoch": 1.2280880295127847, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.789533615112305, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8650479316711426, + "num_tokens": 368374103.0, + "step": 9654 + }, + { + "epoch": 1.2282152397913753, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.858301162719727, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8684729933738708, + "num_tokens": 368411928.0, + "step": 9655 + }, + { + "epoch": 1.2283424500699656, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.754409790039062, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8423928618431091, + "num_tokens": 368449200.0, + "step": 9656 + }, + { + "epoch": 1.228469660348556, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.875869750976562, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8488829135894775, + "num_tokens": 368489935.0, + "step": 9657 + }, + { + "epoch": 1.2285968706271466, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.796415328979492, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8790328502655029, + "num_tokens": 368522227.0, + "step": 9658 + }, + { + "epoch": 1.2287240809057371, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93019676208496, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8660681247711182, + "num_tokens": 368555084.0, + "step": 9659 + }, + { + "epoch": 1.2288512911843277, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.927061080932617, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8619011640548706, + "num_tokens": 368591845.0, + "step": 9660 + }, + { + "epoch": 1.2289785014629182, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.849328994750977, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8554328083992004, + "num_tokens": 368637841.0, + "step": 9661 + }, + { + "epoch": 1.2291057117415087, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.68755340576172, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8810600638389587, + "num_tokens": 368677470.0, + "step": 9662 + }, + { + "epoch": 1.2292329220200993, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.661041259765625, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8790230751037598, + "num_tokens": 368718412.0, + "step": 9663 + }, + { + "epoch": 1.2293601322986898, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.959484100341797, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8543792963027954, + "num_tokens": 368759267.0, + "step": 9664 + }, + { + "epoch": 1.2294873425772803, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.72646141052246, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8573593497276306, + "num_tokens": 368797586.0, + "step": 9665 + }, + { + "epoch": 1.2296145528558708, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.732555389404297, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.858229398727417, + "num_tokens": 368833801.0, + "step": 9666 + }, + { + "epoch": 1.2297417631344612, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93280792236328, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8501728773117065, + "num_tokens": 368878513.0, + "step": 9667 + }, + { + "epoch": 1.2298689734130517, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.640127182006836, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.859101414680481, + "num_tokens": 368917815.0, + "step": 9668 + }, + { + "epoch": 1.2299961836916422, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.94087791442871, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8544463515281677, + "num_tokens": 368954971.0, + "step": 9669 + }, + { + "epoch": 1.2301233939702327, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.508159637451172, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8705635666847229, + "num_tokens": 368996212.0, + "step": 9670 + }, + { + "epoch": 1.2302506042488233, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.07894515991211, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8795440196990967, + "num_tokens": 369032186.0, + "step": 9671 + }, + { + "epoch": 1.2303778145274138, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.820981979370117, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8706516623497009, + "num_tokens": 369077565.0, + "step": 9672 + }, + { + "epoch": 1.2305050248060043, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.97395133972168, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8729997873306274, + "num_tokens": 369111935.0, + "step": 9673 + }, + { + "epoch": 1.2306322350845948, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.796707153320312, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8641226291656494, + "num_tokens": 369154374.0, + "step": 9674 + }, + { + "epoch": 1.2307594453631854, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.932415008544922, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8576623201370239, + "num_tokens": 369198557.0, + "step": 9675 + }, + { + "epoch": 1.230886655641776, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80035972595215, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8493688106536865, + "num_tokens": 369237756.0, + "step": 9676 + }, + { + "epoch": 1.2310138659203664, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.948135375976562, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8529917001724243, + "num_tokens": 369270155.0, + "step": 9677 + }, + { + "epoch": 1.231141076198957, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.044252395629883, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8584988117218018, + "num_tokens": 369310870.0, + "step": 9678 + }, + { + "epoch": 1.2312682864775475, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.892650604248047, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8475595116615295, + "num_tokens": 369344679.0, + "step": 9679 + }, + { + "epoch": 1.231395496756138, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.903291702270508, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.86553955078125, + "num_tokens": 369380388.0, + "step": 9680 + }, + { + "epoch": 1.2315227070347283, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.641620635986328, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8769643306732178, + "num_tokens": 369422327.0, + "step": 9681 + }, + { + "epoch": 1.2316499173133189, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.916109085083008, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8727321624755859, + "num_tokens": 369457160.0, + "step": 9682 + }, + { + "epoch": 1.2317771275919094, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6661434173584, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8656671643257141, + "num_tokens": 369497883.0, + "step": 9683 + }, + { + "epoch": 1.2319043378705, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.77937126159668, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8950826525688171, + "num_tokens": 369540014.0, + "step": 9684 + }, + { + "epoch": 1.2320315481490904, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.788515090942383, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8646238446235657, + "num_tokens": 369580424.0, + "step": 9685 + }, + { + "epoch": 1.232158758427681, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74416732788086, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8620398044586182, + "num_tokens": 369623615.0, + "step": 9686 + }, + { + "epoch": 1.2322859687062715, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.64459800720215, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8540389537811279, + "num_tokens": 369658408.0, + "step": 9687 + }, + { + "epoch": 1.232413178984862, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.003318786621094, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8629414439201355, + "num_tokens": 369696311.0, + "step": 9688 + }, + { + "epoch": 1.2325403892634526, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.648677825927734, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8527809381484985, + "num_tokens": 369737473.0, + "step": 9689 + }, + { + "epoch": 1.232667599542043, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.715238571166992, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8636281490325928, + "num_tokens": 369770151.0, + "step": 9690 + }, + { + "epoch": 1.2327948098206334, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.802560806274414, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8835065960884094, + "num_tokens": 369813503.0, + "step": 9691 + }, + { + "epoch": 1.232922020099224, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.69346809387207, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8672117590904236, + "num_tokens": 369848768.0, + "step": 9692 + }, + { + "epoch": 1.2330492303778144, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.728506088256836, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8683091402053833, + "num_tokens": 369892510.0, + "step": 9693 + }, + { + "epoch": 1.233176440656405, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.645708084106445, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8734545707702637, + "num_tokens": 369929638.0, + "step": 9694 + }, + { + "epoch": 1.2333036509349955, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.925289154052734, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8860431909561157, + "num_tokens": 369971201.0, + "step": 9695 + }, + { + "epoch": 1.233430861213586, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.773733139038086, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8599675297737122, + "num_tokens": 370011713.0, + "step": 9696 + }, + { + "epoch": 1.2335580714921766, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.85251808166504, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8463045954704285, + "num_tokens": 370042647.0, + "step": 9697 + }, + { + "epoch": 1.233685281770767, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8542537689209, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8638261556625366, + "num_tokens": 370083976.0, + "step": 9698 + }, + { + "epoch": 1.2338124920493576, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.837114334106445, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.861742377281189, + "num_tokens": 370126435.0, + "step": 9699 + }, + { + "epoch": 1.2339397023279481, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.887868881225586, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8740943074226379, + "num_tokens": 370165038.0, + "step": 9700 + }, + { + "epoch": 1.2340669126065387, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.051952362060547, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8576614260673523, + "num_tokens": 370203438.0, + "step": 9701 + }, + { + "epoch": 1.2341941228851292, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.70107650756836, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8808426856994629, + "num_tokens": 370235144.0, + "step": 9702 + }, + { + "epoch": 1.2343213331637197, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.996183395385742, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.860285758972168, + "num_tokens": 370278936.0, + "step": 9703 + }, + { + "epoch": 1.2344485434423103, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88475799560547, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.869495153427124, + "num_tokens": 370312595.0, + "step": 9704 + }, + { + "epoch": 1.2345757537209006, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.758769989013672, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8629088401794434, + "num_tokens": 370351697.0, + "step": 9705 + }, + { + "epoch": 1.234702963999491, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.996021270751953, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8789742588996887, + "num_tokens": 370391264.0, + "step": 9706 + }, + { + "epoch": 1.2348301742780816, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.71999168395996, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8726730346679688, + "num_tokens": 370425248.0, + "step": 9707 + }, + { + "epoch": 1.2349573845566721, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.802709579467773, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8440589308738708, + "num_tokens": 370462670.0, + "step": 9708 + }, + { + "epoch": 1.2350845948352627, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.840255737304688, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8455436825752258, + "num_tokens": 370506126.0, + "step": 9709 + }, + { + "epoch": 1.2352118051138532, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8848934173584, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8751007914543152, + "num_tokens": 370546895.0, + "step": 9710 + }, + { + "epoch": 1.2353390153924437, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.696718215942383, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8694725036621094, + "num_tokens": 370590645.0, + "step": 9711 + }, + { + "epoch": 1.2354662256710343, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74180030822754, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8685343861579895, + "num_tokens": 370628614.0, + "step": 9712 + }, + { + "epoch": 1.2355934359496248, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.817955017089844, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8668738007545471, + "num_tokens": 370666082.0, + "step": 9713 + }, + { + "epoch": 1.2357206462282153, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.770034790039062, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8518671989440918, + "num_tokens": 370705780.0, + "step": 9714 + }, + { + "epoch": 1.2358478565068058, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.862361907958984, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8707807660102844, + "num_tokens": 370742550.0, + "step": 9715 + }, + { + "epoch": 1.2359750667853961, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.781755447387695, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8701586127281189, + "num_tokens": 370784406.0, + "step": 9716 + }, + { + "epoch": 1.2361022770639867, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.786888122558594, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.861290454864502, + "num_tokens": 370820911.0, + "step": 9717 + }, + { + "epoch": 1.2362294873425772, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.791439056396484, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8766652941703796, + "num_tokens": 370857427.0, + "step": 9718 + }, + { + "epoch": 1.2363566976211677, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.074840545654297, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8555289506912231, + "num_tokens": 370897989.0, + "step": 9719 + }, + { + "epoch": 1.2364839078997583, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.672115325927734, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8540298342704773, + "num_tokens": 370932672.0, + "step": 9720 + }, + { + "epoch": 1.2366111181783488, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.801071166992188, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8575805425643921, + "num_tokens": 370971027.0, + "step": 9721 + }, + { + "epoch": 1.2367383284569393, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.708946228027344, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8760042190551758, + "num_tokens": 371014742.0, + "step": 9722 + }, + { + "epoch": 1.2368655387355298, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.17462158203125, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8503392934799194, + "num_tokens": 371052065.0, + "step": 9723 + }, + { + "epoch": 1.2369927490141204, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.74045181274414, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.844955563545227, + "num_tokens": 371093897.0, + "step": 9724 + }, + { + "epoch": 1.237119959292711, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.76881217956543, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8662933111190796, + "num_tokens": 371132390.0, + "step": 9725 + }, + { + "epoch": 1.2372471695713014, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.883031845092773, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.877363383769989, + "num_tokens": 371170468.0, + "step": 9726 + }, + { + "epoch": 1.237374379849892, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.630823135375977, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8623759150505066, + "num_tokens": 371209323.0, + "step": 9727 + }, + { + "epoch": 1.2375015901284825, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.888687133789062, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.865143895149231, + "num_tokens": 371246526.0, + "step": 9728 + }, + { + "epoch": 1.237628800407073, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.820005416870117, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8688676357269287, + "num_tokens": 371282850.0, + "step": 9729 + }, + { + "epoch": 1.2377560106856633, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.970508575439453, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8738539218902588, + "num_tokens": 371312614.0, + "step": 9730 + }, + { + "epoch": 1.2378832209642538, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91447639465332, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8674321174621582, + "num_tokens": 371350170.0, + "step": 9731 + }, + { + "epoch": 1.2380104312428444, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.682815551757812, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8721327781677246, + "num_tokens": 371389295.0, + "step": 9732 + }, + { + "epoch": 1.238137641521435, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.879793167114258, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8658542633056641, + "num_tokens": 371430579.0, + "step": 9733 + }, + { + "epoch": 1.2382648518000254, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.73204803466797, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8570761680603027, + "num_tokens": 371465087.0, + "step": 9734 + }, + { + "epoch": 1.238392062078616, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.902328491210938, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8734721541404724, + "num_tokens": 371503549.0, + "step": 9735 + }, + { + "epoch": 1.2385192723572065, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.10713005065918, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8844621777534485, + "num_tokens": 371542953.0, + "step": 9736 + }, + { + "epoch": 1.238646482635797, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.805875778198242, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8692086935043335, + "num_tokens": 371582541.0, + "step": 9737 + }, + { + "epoch": 1.2387736929143875, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.81195068359375, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.876493513584137, + "num_tokens": 371617221.0, + "step": 9738 + }, + { + "epoch": 1.238900903192978, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87822914123535, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8692322969436646, + "num_tokens": 371660773.0, + "step": 9739 + }, + { + "epoch": 1.2390281134715684, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.701419830322266, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.860660195350647, + "num_tokens": 371694763.0, + "step": 9740 + }, + { + "epoch": 1.239155323750159, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.900014877319336, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8649827241897583, + "num_tokens": 371734793.0, + "step": 9741 + }, + { + "epoch": 1.2392825340287494, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8126277923584, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8611298203468323, + "num_tokens": 371775691.0, + "step": 9742 + }, + { + "epoch": 1.23940974430734, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.924991607666016, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8582072257995605, + "num_tokens": 371817319.0, + "step": 9743 + }, + { + "epoch": 1.2395369545859305, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.913862228393555, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8650631904602051, + "num_tokens": 371847431.0, + "step": 9744 + }, + { + "epoch": 1.239664164864521, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.850114822387695, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8789100050926208, + "num_tokens": 371881788.0, + "step": 9745 + }, + { + "epoch": 1.2397913751431116, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.007932662963867, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8610830307006836, + "num_tokens": 371917388.0, + "step": 9746 + }, + { + "epoch": 1.239918585421702, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.858224868774414, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8752526640892029, + "num_tokens": 371957561.0, + "step": 9747 + }, + { + "epoch": 1.2400457957002926, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.854747772216797, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8588255643844604, + "num_tokens": 371987322.0, + "step": 9748 + }, + { + "epoch": 1.2401730059788831, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.715335845947266, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8687255382537842, + "num_tokens": 372027263.0, + "step": 9749 + }, + { + "epoch": 1.2403002162574737, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.733503341674805, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8704078197479248, + "num_tokens": 372061441.0, + "step": 9750 + }, + { + "epoch": 1.2404274265360642, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95902442932129, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8712090849876404, + "num_tokens": 372099085.0, + "step": 9751 + }, + { + "epoch": 1.2405546368146547, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.807613372802734, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8686025142669678, + "num_tokens": 372134106.0, + "step": 9752 + }, + { + "epoch": 1.2406818470932452, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.988574981689453, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8740379810333252, + "num_tokens": 372164635.0, + "step": 9753 + }, + { + "epoch": 1.2408090573718356, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.804954528808594, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8716031908988953, + "num_tokens": 372204924.0, + "step": 9754 + }, + { + "epoch": 1.240936267650426, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.948867797851562, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8659630417823792, + "num_tokens": 372240939.0, + "step": 9755 + }, + { + "epoch": 1.2410634779290166, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.071739196777344, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.861992359161377, + "num_tokens": 372272355.0, + "step": 9756 + }, + { + "epoch": 1.2411906882076071, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.66013526916504, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8642251491546631, + "num_tokens": 372309566.0, + "step": 9757 + }, + { + "epoch": 1.2413178984861977, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88422203063965, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8608086705207825, + "num_tokens": 372347768.0, + "step": 9758 + }, + { + "epoch": 1.2414451087647882, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78593635559082, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8742462396621704, + "num_tokens": 372381020.0, + "step": 9759 + }, + { + "epoch": 1.2415723190433787, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.82341194152832, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8558706045150757, + "num_tokens": 372422434.0, + "step": 9760 + }, + { + "epoch": 1.2416995293219693, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.709871292114258, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8572168350219727, + "num_tokens": 372459342.0, + "step": 9761 + }, + { + "epoch": 1.2418267396005598, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.879018783569336, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8689942955970764, + "num_tokens": 372496637.0, + "step": 9762 + }, + { + "epoch": 1.2419539498791503, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.826419830322266, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.878271222114563, + "num_tokens": 372532068.0, + "step": 9763 + }, + { + "epoch": 1.2420811601577408, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.823122024536133, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8727760314941406, + "num_tokens": 372565460.0, + "step": 9764 + }, + { + "epoch": 1.2422083704363311, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.919002532958984, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8701019287109375, + "num_tokens": 372605998.0, + "step": 9765 + }, + { + "epoch": 1.2423355807149217, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93874740600586, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8561221361160278, + "num_tokens": 372642038.0, + "step": 9766 + }, + { + "epoch": 1.2424627909935122, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05894660949707, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8620290756225586, + "num_tokens": 372678192.0, + "step": 9767 + }, + { + "epoch": 1.2425900012721027, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.765745162963867, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8621543049812317, + "num_tokens": 372712867.0, + "step": 9768 + }, + { + "epoch": 1.2427172115506933, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.214412689208984, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8483519554138184, + "num_tokens": 372745790.0, + "step": 9769 + }, + { + "epoch": 1.2428444218292838, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.610721588134766, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8769735097885132, + "num_tokens": 372777634.0, + "step": 9770 + }, + { + "epoch": 1.2429716321078743, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93686866760254, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8509747385978699, + "num_tokens": 372820527.0, + "step": 9771 + }, + { + "epoch": 1.2430988423864648, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.877119064331055, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.875762939453125, + "num_tokens": 372858479.0, + "step": 9772 + }, + { + "epoch": 1.2432260526650554, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.708967208862305, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.872981071472168, + "num_tokens": 372895219.0, + "step": 9773 + }, + { + "epoch": 1.243353262943646, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.845197677612305, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8775910139083862, + "num_tokens": 372934491.0, + "step": 9774 + }, + { + "epoch": 1.2434804732222364, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.788463592529297, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8575270175933838, + "num_tokens": 372972892.0, + "step": 9775 + }, + { + "epoch": 1.243607683500827, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.7542781829834, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8693234920501709, + "num_tokens": 373011564.0, + "step": 9776 + }, + { + "epoch": 1.2437348937794175, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.922069549560547, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8682429194450378, + "num_tokens": 373049444.0, + "step": 9777 + }, + { + "epoch": 1.243862104058008, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.85051155090332, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8658135533332825, + "num_tokens": 373085466.0, + "step": 9778 + }, + { + "epoch": 1.2439893143365983, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00249671936035, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.844919741153717, + "num_tokens": 373124712.0, + "step": 9779 + }, + { + "epoch": 1.2441165246151888, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.7550048828125, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.871877133846283, + "num_tokens": 373160008.0, + "step": 9780 + }, + { + "epoch": 1.2442437348937794, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.924314498901367, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8667259216308594, + "num_tokens": 373192110.0, + "step": 9781 + }, + { + "epoch": 1.24437094517237, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.733642578125, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.867333173751831, + "num_tokens": 373234432.0, + "step": 9782 + }, + { + "epoch": 1.2444981554509604, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.900053024291992, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8561885952949524, + "num_tokens": 373270725.0, + "step": 9783 + }, + { + "epoch": 1.244625365729551, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.021488189697266, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8437987565994263, + "num_tokens": 373307531.0, + "step": 9784 + }, + { + "epoch": 1.2447525760081415, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.800607681274414, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8701407313346863, + "num_tokens": 373351657.0, + "step": 9785 + }, + { + "epoch": 1.244879786286732, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.961502075195312, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8711293935775757, + "num_tokens": 373388811.0, + "step": 9786 + }, + { + "epoch": 1.2450069965653225, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.762203216552734, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8754004240036011, + "num_tokens": 373423378.0, + "step": 9787 + }, + { + "epoch": 1.245134206843913, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.756351470947266, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8678385019302368, + "num_tokens": 373460982.0, + "step": 9788 + }, + { + "epoch": 1.2452614171225034, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80398941040039, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8729040622711182, + "num_tokens": 373501474.0, + "step": 9789 + }, + { + "epoch": 1.245388627401094, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86788558959961, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8704078197479248, + "num_tokens": 373537830.0, + "step": 9790 + }, + { + "epoch": 1.2455158376796844, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.69395637512207, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8598452806472778, + "num_tokens": 373573975.0, + "step": 9791 + }, + { + "epoch": 1.245643047958275, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.834274291992188, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.846055805683136, + "num_tokens": 373612157.0, + "step": 9792 + }, + { + "epoch": 1.2457702582368655, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80246353149414, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.885168194770813, + "num_tokens": 373645065.0, + "step": 9793 + }, + { + "epoch": 1.245897468515456, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.85857582092285, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8613513112068176, + "num_tokens": 373687658.0, + "step": 9794 + }, + { + "epoch": 1.2460246787940465, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78108024597168, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8675122261047363, + "num_tokens": 373732262.0, + "step": 9795 + }, + { + "epoch": 1.246151889072637, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.911474227905273, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8467178344726562, + "num_tokens": 373772410.0, + "step": 9796 + }, + { + "epoch": 1.2462790993512276, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.752063751220703, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8674145340919495, + "num_tokens": 373808773.0, + "step": 9797 + }, + { + "epoch": 1.2464063096298181, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.057323455810547, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.861534833908081, + "num_tokens": 373848945.0, + "step": 9798 + }, + { + "epoch": 1.2465335199084087, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87220573425293, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8605378866195679, + "num_tokens": 373892716.0, + "step": 9799 + }, + { + "epoch": 1.2466607301869992, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.901592254638672, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8812100887298584, + "num_tokens": 373929443.0, + "step": 9800 + }, + { + "epoch": 1.2467879404655897, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96477699279785, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8663734793663025, + "num_tokens": 373968882.0, + "step": 9801 + }, + { + "epoch": 1.2469151507441802, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.97039794921875, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8787350654602051, + "num_tokens": 374009265.0, + "step": 9802 + }, + { + "epoch": 1.2470423610227706, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.748302459716797, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.850986123085022, + "num_tokens": 374047152.0, + "step": 9803 + }, + { + "epoch": 1.247169571301361, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.951217651367188, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.861792802810669, + "num_tokens": 374083127.0, + "step": 9804 + }, + { + "epoch": 1.2472967815799516, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.879018783569336, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8747913837432861, + "num_tokens": 374127086.0, + "step": 9805 + }, + { + "epoch": 1.2474239918585421, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.99907875061035, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8608971834182739, + "num_tokens": 374163879.0, + "step": 9806 + }, + { + "epoch": 1.2475512021371327, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88616180419922, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8706537485122681, + "num_tokens": 374206639.0, + "step": 9807 + }, + { + "epoch": 1.2476784124157232, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.992921829223633, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.863503098487854, + "num_tokens": 374244106.0, + "step": 9808 + }, + { + "epoch": 1.2478056226943137, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.719987869262695, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8623457551002502, + "num_tokens": 374285776.0, + "step": 9809 + }, + { + "epoch": 1.2479328329729042, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.94382095336914, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8764322996139526, + "num_tokens": 374323036.0, + "step": 9810 + }, + { + "epoch": 1.2480600432514948, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.054811477661133, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8726000785827637, + "num_tokens": 374360204.0, + "step": 9811 + }, + { + "epoch": 1.2481872535300853, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.788490295410156, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8761575818061829, + "num_tokens": 374394961.0, + "step": 9812 + }, + { + "epoch": 1.2483144638086758, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93768310546875, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8678691387176514, + "num_tokens": 374436142.0, + "step": 9813 + }, + { + "epoch": 1.2484416740872661, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05961036682129, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8635863065719604, + "num_tokens": 374474263.0, + "step": 9814 + }, + { + "epoch": 1.2485688843658567, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.082319259643555, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8603066205978394, + "num_tokens": 374510250.0, + "step": 9815 + }, + { + "epoch": 1.2486960946444472, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.84272575378418, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8845155239105225, + "num_tokens": 374549579.0, + "step": 9816 + }, + { + "epoch": 1.2488233049230377, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93358039855957, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8653368353843689, + "num_tokens": 374588631.0, + "step": 9817 + }, + { + "epoch": 1.2489505152016283, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90760040283203, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8627240657806396, + "num_tokens": 374627092.0, + "step": 9818 + }, + { + "epoch": 1.2490777254802188, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.683366775512695, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8716120719909668, + "num_tokens": 374657797.0, + "step": 9819 + }, + { + "epoch": 1.2492049357588093, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.086637496948242, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8715873956680298, + "num_tokens": 374694011.0, + "step": 9820 + }, + { + "epoch": 1.2493321460373998, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.853008270263672, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8683120608329773, + "num_tokens": 374735127.0, + "step": 9821 + }, + { + "epoch": 1.2494593563159904, + "ewc_loss": 0.031005859375, + "ewc_loss_parallel": 3.0994415283203125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9075927734375, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8634904026985168, + "num_tokens": 374774170.0, + "step": 9822 + }, + { + "epoch": 1.249586566594581, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.888246536254883, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8667504787445068, + "num_tokens": 374809162.0, + "step": 9823 + }, + { + "epoch": 1.2497137768731714, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00571060180664, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8587055206298828, + "num_tokens": 374847974.0, + "step": 9824 + }, + { + "epoch": 1.249840987151762, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83802032470703, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8823246359825134, + "num_tokens": 374894911.0, + "step": 9825 + }, + { + "epoch": 1.2499681974303525, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.028772354125977, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8603637218475342, + "num_tokens": 374933534.0, + "step": 9826 + }, + { + "epoch": 1.250095407708943, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.79195213317871, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8544122576713562, + "num_tokens": 374968183.0, + "step": 9827 + }, + { + "epoch": 1.2502226179875333, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.991409301757812, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.858880877494812, + "num_tokens": 375005973.0, + "step": 9828 + }, + { + "epoch": 1.2503498282661238, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.004724502563477, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8597623109817505, + "num_tokens": 375043446.0, + "step": 9829 + }, + { + "epoch": 1.2504770385447144, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.869552612304688, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8691761493682861, + "num_tokens": 375081518.0, + "step": 9830 + }, + { + "epoch": 1.250604248823305, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.924842834472656, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8588436841964722, + "num_tokens": 375117767.0, + "step": 9831 + }, + { + "epoch": 1.2507314591018954, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.917743682861328, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8426918983459473, + "num_tokens": 375152061.0, + "step": 9832 + }, + { + "epoch": 1.250858669380486, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.955711364746094, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.859898567199707, + "num_tokens": 375195677.0, + "step": 9833 + }, + { + "epoch": 1.2509858796590765, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.897878646850586, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8586735129356384, + "num_tokens": 375230319.0, + "step": 9834 + }, + { + "epoch": 1.251113089937667, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.981040954589844, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8711670637130737, + "num_tokens": 375268970.0, + "step": 9835 + }, + { + "epoch": 1.2512403002162575, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.920289993286133, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.873015284538269, + "num_tokens": 375303661.0, + "step": 9836 + }, + { + "epoch": 1.2513675104948478, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.876848220825195, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8734309673309326, + "num_tokens": 375343278.0, + "step": 9837 + }, + { + "epoch": 1.2514947207734384, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.828655242919922, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8723655939102173, + "num_tokens": 375378574.0, + "step": 9838 + }, + { + "epoch": 1.251621931052029, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.712980270385742, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8612609505653381, + "num_tokens": 375412781.0, + "step": 9839 + }, + { + "epoch": 1.2517491413306194, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86382484436035, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8777739405632019, + "num_tokens": 375450431.0, + "step": 9840 + }, + { + "epoch": 1.25187635160921, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.958602905273438, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8600565791130066, + "num_tokens": 375490692.0, + "step": 9841 + }, + { + "epoch": 1.2520035618878005, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.781824111938477, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.87198406457901, + "num_tokens": 375535278.0, + "step": 9842 + }, + { + "epoch": 1.252130772166391, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.009618759155273, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8595269918441772, + "num_tokens": 375575859.0, + "step": 9843 + }, + { + "epoch": 1.2522579824449815, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.778371810913086, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8583948612213135, + "num_tokens": 375611084.0, + "step": 9844 + }, + { + "epoch": 1.252385192723572, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.880455017089844, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8764873743057251, + "num_tokens": 375644926.0, + "step": 9845 + }, + { + "epoch": 1.2525124030021626, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.932071685791016, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8748713731765747, + "num_tokens": 375686970.0, + "step": 9846 + }, + { + "epoch": 1.2526396132807531, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.79621696472168, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8780109882354736, + "num_tokens": 375727372.0, + "step": 9847 + }, + { + "epoch": 1.2527668235593437, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.999706268310547, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8546070456504822, + "num_tokens": 375763568.0, + "step": 9848 + }, + { + "epoch": 1.2528940338379342, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.895427703857422, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8494868278503418, + "num_tokens": 375797954.0, + "step": 9849 + }, + { + "epoch": 1.2530212441165247, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.043169021606445, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8640894889831543, + "num_tokens": 375836362.0, + "step": 9850 + }, + { + "epoch": 1.2531484543951152, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.008180618286133, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8615552186965942, + "num_tokens": 375879403.0, + "step": 9851 + }, + { + "epoch": 1.2532756646737058, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.69157600402832, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8688533306121826, + "num_tokens": 375916728.0, + "step": 9852 + }, + { + "epoch": 1.253402874952296, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.003902435302734, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.86053466796875, + "num_tokens": 375951743.0, + "step": 9853 + }, + { + "epoch": 1.2535300852308866, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.862682342529297, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8403022885322571, + "num_tokens": 375988872.0, + "step": 9854 + }, + { + "epoch": 1.2536572955094771, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.041301727294922, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8554847240447998, + "num_tokens": 376030388.0, + "step": 9855 + }, + { + "epoch": 1.2537845057880677, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8123722076416, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.875156581401825, + "num_tokens": 376062975.0, + "step": 9856 + }, + { + "epoch": 1.2539117160666582, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91825294494629, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8591843247413635, + "num_tokens": 376101873.0, + "step": 9857 + }, + { + "epoch": 1.2540389263452487, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.84649658203125, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8450599908828735, + "num_tokens": 376138774.0, + "step": 9858 + }, + { + "epoch": 1.2541661366238392, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.850614547729492, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.857253909111023, + "num_tokens": 376172357.0, + "step": 9859 + }, + { + "epoch": 1.2542933469024298, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.012239456176758, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8551623821258545, + "num_tokens": 376212977.0, + "step": 9860 + }, + { + "epoch": 1.2544205571810203, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.76576042175293, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8560183048248291, + "num_tokens": 376256989.0, + "step": 9861 + }, + { + "epoch": 1.2545477674596106, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.933809280395508, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.869909405708313, + "num_tokens": 376300130.0, + "step": 9862 + }, + { + "epoch": 1.2546749777382011, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.755422592163086, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8692018985748291, + "num_tokens": 376337475.0, + "step": 9863 + }, + { + "epoch": 1.2548021880167917, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.180438995361328, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8569543957710266, + "num_tokens": 376376485.0, + "step": 9864 + }, + { + "epoch": 1.2549293982953822, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.781457901000977, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8714688420295715, + "num_tokens": 376413393.0, + "step": 9865 + }, + { + "epoch": 1.2550566085739727, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0468692779541, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8560819625854492, + "num_tokens": 376448192.0, + "step": 9866 + }, + { + "epoch": 1.2551838188525632, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.858478546142578, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8692970871925354, + "num_tokens": 376489382.0, + "step": 9867 + }, + { + "epoch": 1.2553110291311538, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.844768524169922, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8698785305023193, + "num_tokens": 376527221.0, + "step": 9868 + }, + { + "epoch": 1.2554382394097443, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9939022064209, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8616927862167358, + "num_tokens": 376567045.0, + "step": 9869 + }, + { + "epoch": 1.2555654496883348, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.82356071472168, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8725035786628723, + "num_tokens": 376607039.0, + "step": 9870 + }, + { + "epoch": 1.2556926599669254, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9124813079834, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.867438554763794, + "num_tokens": 376643220.0, + "step": 9871 + }, + { + "epoch": 1.255819870245516, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83058738708496, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8845922946929932, + "num_tokens": 376681417.0, + "step": 9872 + }, + { + "epoch": 1.2559470805241064, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.791349411010742, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8834667205810547, + "num_tokens": 376722836.0, + "step": 9873 + }, + { + "epoch": 1.256074290802697, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.84808921813965, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8592514991760254, + "num_tokens": 376763194.0, + "step": 9874 + }, + { + "epoch": 1.2562015010812875, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9040584564209, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8610203266143799, + "num_tokens": 376803701.0, + "step": 9875 + }, + { + "epoch": 1.256328711359878, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0312557220459, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8742692470550537, + "num_tokens": 376835296.0, + "step": 9876 + }, + { + "epoch": 1.2564559216384683, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.811100006103516, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8695574402809143, + "num_tokens": 376876317.0, + "step": 9877 + }, + { + "epoch": 1.2565831319170588, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.960655212402344, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8636133074760437, + "num_tokens": 376916403.0, + "step": 9878 + }, + { + "epoch": 1.2567103421956494, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.761507034301758, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8750501871109009, + "num_tokens": 376956079.0, + "step": 9879 + }, + { + "epoch": 1.25683755247424, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8237247467041, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8563017249107361, + "num_tokens": 376991618.0, + "step": 9880 + }, + { + "epoch": 1.2569647627528304, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.954662322998047, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8731784820556641, + "num_tokens": 377033964.0, + "step": 9881 + }, + { + "epoch": 1.257091973031421, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.929920196533203, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8738193511962891, + "num_tokens": 377070091.0, + "step": 9882 + }, + { + "epoch": 1.2572191833100115, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.878206253051758, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8739235401153564, + "num_tokens": 377111185.0, + "step": 9883 + }, + { + "epoch": 1.257346393588602, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.97279167175293, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8708356618881226, + "num_tokens": 377147637.0, + "step": 9884 + }, + { + "epoch": 1.2574736038671925, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00492286682129, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8660495281219482, + "num_tokens": 377181918.0, + "step": 9885 + }, + { + "epoch": 1.2576008141457828, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.791831970214844, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8597436547279358, + "num_tokens": 377222900.0, + "step": 9886 + }, + { + "epoch": 1.2577280244243734, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.932090759277344, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8801161050796509, + "num_tokens": 377263107.0, + "step": 9887 + }, + { + "epoch": 1.257855234702964, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.94091796875, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.881301999092102, + "num_tokens": 377301384.0, + "step": 9888 + }, + { + "epoch": 1.2579824449815544, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88300895690918, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8601555824279785, + "num_tokens": 377339865.0, + "step": 9889 + }, + { + "epoch": 1.258109655260145, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.730064392089844, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8748863935470581, + "num_tokens": 377381582.0, + "step": 9890 + }, + { + "epoch": 1.2582368655387355, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.939495086669922, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8426607251167297, + "num_tokens": 377425548.0, + "step": 9891 + }, + { + "epoch": 1.258364075817326, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.933330535888672, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8656632900238037, + "num_tokens": 377462298.0, + "step": 9892 + }, + { + "epoch": 1.2584912860959165, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.80159568786621, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8581169247627258, + "num_tokens": 377503873.0, + "step": 9893 + }, + { + "epoch": 1.258618496374507, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.815135955810547, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8804843425750732, + "num_tokens": 377540497.0, + "step": 9894 + }, + { + "epoch": 1.2587457066530976, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9587345123291, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8673409819602966, + "num_tokens": 377582242.0, + "step": 9895 + }, + { + "epoch": 1.2588729169316881, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.996004104614258, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8823398351669312, + "num_tokens": 377614972.0, + "step": 9896 + }, + { + "epoch": 1.2590001272102787, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.941017150878906, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8634193539619446, + "num_tokens": 377645392.0, + "step": 9897 + }, + { + "epoch": 1.2591273374888692, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.98961639404297, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8741564750671387, + "num_tokens": 377680317.0, + "step": 9898 + }, + { + "epoch": 1.2592545477674597, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0047607421875, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8661690354347229, + "num_tokens": 377714879.0, + "step": 9899 + }, + { + "epoch": 1.2593817580460502, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86884117126465, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8514376878738403, + "num_tokens": 377751404.0, + "step": 9900 + }, + { + "epoch": 1.2595089683246408, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.060625076293945, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8701416254043579, + "num_tokens": 377788354.0, + "step": 9901 + }, + { + "epoch": 1.259636178603231, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.866092681884766, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8556183576583862, + "num_tokens": 377827259.0, + "step": 9902 + }, + { + "epoch": 1.2597633888818216, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.884601593017578, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8580988645553589, + "num_tokens": 377872477.0, + "step": 9903 + }, + { + "epoch": 1.2598905991604121, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.787643432617188, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.859218180179596, + "num_tokens": 377911159.0, + "step": 9904 + }, + { + "epoch": 1.2600178094390027, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.930082321166992, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8701727390289307, + "num_tokens": 377951919.0, + "step": 9905 + }, + { + "epoch": 1.2601450197175932, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.82222557067871, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8477947115898132, + "num_tokens": 377997567.0, + "step": 9906 + }, + { + "epoch": 1.2602722299961837, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.810808181762695, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8653069734573364, + "num_tokens": 378033811.0, + "step": 9907 + }, + { + "epoch": 1.2603994402747742, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.917957305908203, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8543962240219116, + "num_tokens": 378070136.0, + "step": 9908 + }, + { + "epoch": 1.2605266505533648, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.71861457824707, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8634306192398071, + "num_tokens": 378106902.0, + "step": 9909 + }, + { + "epoch": 1.2606538608319553, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.848249435424805, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.869223952293396, + "num_tokens": 378144638.0, + "step": 9910 + }, + { + "epoch": 1.2607810711105456, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.886987686157227, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8744674921035767, + "num_tokens": 378179059.0, + "step": 9911 + }, + { + "epoch": 1.2609082813891361, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03428840637207, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.852678656578064, + "num_tokens": 378210346.0, + "step": 9912 + }, + { + "epoch": 1.2610354916677267, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.749818801879883, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8718140125274658, + "num_tokens": 378242687.0, + "step": 9913 + }, + { + "epoch": 1.2611627019463172, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.89618492126465, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8717221617698669, + "num_tokens": 378279421.0, + "step": 9914 + }, + { + "epoch": 1.2612899122249077, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.834369659423828, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8609331250190735, + "num_tokens": 378318133.0, + "step": 9915 + }, + { + "epoch": 1.2614171225034982, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.790775299072266, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8485493659973145, + "num_tokens": 378362987.0, + "step": 9916 + }, + { + "epoch": 1.2615443327820888, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.874393463134766, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8828592896461487, + "num_tokens": 378396197.0, + "step": 9917 + }, + { + "epoch": 1.2616715430606793, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.855865478515625, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8739528059959412, + "num_tokens": 378433691.0, + "step": 9918 + }, + { + "epoch": 1.2617987533392698, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00556182861328, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8695921897888184, + "num_tokens": 378470277.0, + "step": 9919 + }, + { + "epoch": 1.2619259636178604, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.753198623657227, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8598196506500244, + "num_tokens": 378510500.0, + "step": 9920 + }, + { + "epoch": 1.2620531738964509, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.007150650024414, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8641722202301025, + "num_tokens": 378552893.0, + "step": 9921 + }, + { + "epoch": 1.2621803841750414, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.819658279418945, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8722848296165466, + "num_tokens": 378590917.0, + "step": 9922 + }, + { + "epoch": 1.262307594453632, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.99332046508789, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8740809559822083, + "num_tokens": 378637764.0, + "step": 9923 + }, + { + "epoch": 1.2624348047322225, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9183292388916, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8623708486557007, + "num_tokens": 378675497.0, + "step": 9924 + }, + { + "epoch": 1.262562015010813, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.812259674072266, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.857114851474762, + "num_tokens": 378710470.0, + "step": 9925 + }, + { + "epoch": 1.2626892252894033, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.832195281982422, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8624517917633057, + "num_tokens": 378758116.0, + "step": 9926 + }, + { + "epoch": 1.2628164355679938, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.192873001098633, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8623272180557251, + "num_tokens": 378798166.0, + "step": 9927 + }, + { + "epoch": 1.2629436458465844, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.72416877746582, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8809769749641418, + "num_tokens": 378839899.0, + "step": 9928 + }, + { + "epoch": 1.263070856125175, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.074491500854492, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8673444986343384, + "num_tokens": 378886022.0, + "step": 9929 + }, + { + "epoch": 1.2631980664037654, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.922740936279297, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8561842441558838, + "num_tokens": 378925097.0, + "step": 9930 + }, + { + "epoch": 1.263325276682356, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.99553871154785, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8779215812683105, + "num_tokens": 378965242.0, + "step": 9931 + }, + { + "epoch": 1.2634524869609465, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.984844207763672, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8455782532691956, + "num_tokens": 379005932.0, + "step": 9932 + }, + { + "epoch": 1.263579697239537, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90668487548828, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8634005188941956, + "num_tokens": 379045817.0, + "step": 9933 + }, + { + "epoch": 1.2637069075181275, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.010953903198242, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8589998483657837, + "num_tokens": 379086481.0, + "step": 9934 + }, + { + "epoch": 1.2638341177967178, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.89965057373047, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8484037518501282, + "num_tokens": 379129782.0, + "step": 9935 + }, + { + "epoch": 1.2639613280753084, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.06633186340332, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.863463282585144, + "num_tokens": 379171525.0, + "step": 9936 + }, + { + "epoch": 1.264088538353899, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00754737854004, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8590812683105469, + "num_tokens": 379213887.0, + "step": 9937 + }, + { + "epoch": 1.2642157486324894, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86279296875, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.866062343120575, + "num_tokens": 379252634.0, + "step": 9938 + }, + { + "epoch": 1.26434295891108, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95500373840332, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8657761812210083, + "num_tokens": 379287402.0, + "step": 9939 + }, + { + "epoch": 1.2644701691896705, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.181074142456055, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8717226982116699, + "num_tokens": 379325592.0, + "step": 9940 + }, + { + "epoch": 1.264597379468261, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.153303146362305, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8536136150360107, + "num_tokens": 379359010.0, + "step": 9941 + }, + { + "epoch": 1.2647245897468515, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.966609954833984, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8582231998443604, + "num_tokens": 379399783.0, + "step": 9942 + }, + { + "epoch": 1.264851800025442, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.819374084472656, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8690581321716309, + "num_tokens": 379442207.0, + "step": 9943 + }, + { + "epoch": 1.2649790103040326, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.136608123779297, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8649746775627136, + "num_tokens": 379479561.0, + "step": 9944 + }, + { + "epoch": 1.2651062205826231, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.933652877807617, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.87099289894104, + "num_tokens": 379516413.0, + "step": 9945 + }, + { + "epoch": 1.2652334308612136, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.749649047851562, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8562687635421753, + "num_tokens": 379557993.0, + "step": 9946 + }, + { + "epoch": 1.2653606411398042, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.073270797729492, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8469678163528442, + "num_tokens": 379594028.0, + "step": 9947 + }, + { + "epoch": 1.2654878514183947, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.971698760986328, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8592301607131958, + "num_tokens": 379631235.0, + "step": 9948 + }, + { + "epoch": 1.2656150616969852, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95149803161621, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8538026809692383, + "num_tokens": 379668714.0, + "step": 9949 + }, + { + "epoch": 1.2657422719755758, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.783159255981445, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8706709742546082, + "num_tokens": 379710233.0, + "step": 9950 + }, + { + "epoch": 1.265869482254166, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91838836669922, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8623170256614685, + "num_tokens": 379746208.0, + "step": 9951 + }, + { + "epoch": 1.2659966925327566, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.949277877807617, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8749912977218628, + "num_tokens": 379796420.0, + "step": 9952 + }, + { + "epoch": 1.2661239028113471, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.849620819091797, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8664902448654175, + "num_tokens": 379830027.0, + "step": 9953 + }, + { + "epoch": 1.2662511130899377, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.11273193359375, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8652807474136353, + "num_tokens": 379871751.0, + "step": 9954 + }, + { + "epoch": 1.2663783233685282, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87190818786621, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8589809536933899, + "num_tokens": 379908255.0, + "step": 9955 + }, + { + "epoch": 1.2665055336471187, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.983604431152344, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8686003088951111, + "num_tokens": 379944502.0, + "step": 9956 + }, + { + "epoch": 1.2666327439257092, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04526138305664, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8636918067932129, + "num_tokens": 379981929.0, + "step": 9957 + }, + { + "epoch": 1.2667599542042998, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.942440032958984, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.860588014125824, + "num_tokens": 380018876.0, + "step": 9958 + }, + { + "epoch": 1.2668871644828903, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.058826446533203, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8605947494506836, + "num_tokens": 380054153.0, + "step": 9959 + }, + { + "epoch": 1.2670143747614806, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.942825317382812, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8559885621070862, + "num_tokens": 380086550.0, + "step": 9960 + }, + { + "epoch": 1.2671415850400711, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30228042602539, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8605953454971313, + "num_tokens": 380120308.0, + "step": 9961 + }, + { + "epoch": 1.2672687953186617, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.978679656982422, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8487024903297424, + "num_tokens": 380160204.0, + "step": 9962 + }, + { + "epoch": 1.2673960055972522, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.890087127685547, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8686425089836121, + "num_tokens": 380196687.0, + "step": 9963 + }, + { + "epoch": 1.2675232158758427, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04047203063965, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.879427433013916, + "num_tokens": 380237451.0, + "step": 9964 + }, + { + "epoch": 1.2676504261544332, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.969348907470703, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8618693947792053, + "num_tokens": 380276631.0, + "step": 9965 + }, + { + "epoch": 1.2677776364330238, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05556297302246, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8695896863937378, + "num_tokens": 380319432.0, + "step": 9966 + }, + { + "epoch": 1.2679048467116143, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83193016052246, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8740969300270081, + "num_tokens": 380356666.0, + "step": 9967 + }, + { + "epoch": 1.2680320569902048, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87533950805664, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8742985725402832, + "num_tokens": 380390347.0, + "step": 9968 + }, + { + "epoch": 1.2681592672687954, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.76651954650879, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8683942556381226, + "num_tokens": 380424083.0, + "step": 9969 + }, + { + "epoch": 1.2682864775473859, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.919048309326172, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8602890372276306, + "num_tokens": 380471179.0, + "step": 9970 + }, + { + "epoch": 1.2684136878259764, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.125600814819336, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.867181122303009, + "num_tokens": 380510291.0, + "step": 9971 + }, + { + "epoch": 1.268540898104567, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.641071319580078, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8696433901786804, + "num_tokens": 380556297.0, + "step": 9972 + }, + { + "epoch": 1.2686681083831575, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.837926864624023, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8755881786346436, + "num_tokens": 380592113.0, + "step": 9973 + }, + { + "epoch": 1.268795318661748, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.042943954467773, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8792587518692017, + "num_tokens": 380623942.0, + "step": 9974 + }, + { + "epoch": 1.2689225289403383, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.89314842224121, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8587633371353149, + "num_tokens": 380662300.0, + "step": 9975 + }, + { + "epoch": 1.2690497392189288, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.860401153564453, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8670941591262817, + "num_tokens": 380701452.0, + "step": 9976 + }, + { + "epoch": 1.2691769494975194, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.846471786499023, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8759106397628784, + "num_tokens": 380741248.0, + "step": 9977 + }, + { + "epoch": 1.2693041597761099, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.795574188232422, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8586260080337524, + "num_tokens": 380781455.0, + "step": 9978 + }, + { + "epoch": 1.2694313700547004, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.057811737060547, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.855794370174408, + "num_tokens": 380818139.0, + "step": 9979 + }, + { + "epoch": 1.269558580333291, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.766172409057617, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8706562519073486, + "num_tokens": 380860530.0, + "step": 9980 + }, + { + "epoch": 1.2696857906118815, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.865421295166016, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8773422241210938, + "num_tokens": 380900825.0, + "step": 9981 + }, + { + "epoch": 1.269813000890472, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.965484619140625, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.863577127456665, + "num_tokens": 380939270.0, + "step": 9982 + }, + { + "epoch": 1.2699402111690625, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8632869720459, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8430686593055725, + "num_tokens": 380981708.0, + "step": 9983 + }, + { + "epoch": 1.2700674214476528, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.934309005737305, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8708249926567078, + "num_tokens": 381015186.0, + "step": 9984 + }, + { + "epoch": 1.2701946317262434, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.831884384155273, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8756137490272522, + "num_tokens": 381055503.0, + "step": 9985 + }, + { + "epoch": 1.270321842004834, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93448829650879, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8777751922607422, + "num_tokens": 381091935.0, + "step": 9986 + }, + { + "epoch": 1.2704490522834244, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.019227981567383, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8702030181884766, + "num_tokens": 381127951.0, + "step": 9987 + }, + { + "epoch": 1.270576262562015, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.880842208862305, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8594866991043091, + "num_tokens": 381161918.0, + "step": 9988 + }, + { + "epoch": 1.2707034728406055, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.99702262878418, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.853754460811615, + "num_tokens": 381198747.0, + "step": 9989 + }, + { + "epoch": 1.270830683119196, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.713603973388672, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8660115003585815, + "num_tokens": 381242714.0, + "step": 9990 + }, + { + "epoch": 1.2709578933977865, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.97632598876953, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.877590537071228, + "num_tokens": 381274667.0, + "step": 9991 + }, + { + "epoch": 1.271085103676377, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.034048080444336, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8751742243766785, + "num_tokens": 381318110.0, + "step": 9992 + }, + { + "epoch": 1.2712123139549676, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.730661392211914, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8668341040611267, + "num_tokens": 381356694.0, + "step": 9993 + }, + { + "epoch": 1.2713395242335581, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.109981536865234, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8549388647079468, + "num_tokens": 381395218.0, + "step": 9994 + }, + { + "epoch": 1.2714667345121486, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.956392288208008, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8791980743408203, + "num_tokens": 381429195.0, + "step": 9995 + }, + { + "epoch": 1.2715939447907392, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.924081802368164, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8595194220542908, + "num_tokens": 381468276.0, + "step": 9996 + }, + { + "epoch": 1.2717211550693297, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.857730865478516, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8735619187355042, + "num_tokens": 381503400.0, + "step": 9997 + }, + { + "epoch": 1.2718483653479202, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.13473892211914, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8711158037185669, + "num_tokens": 381542126.0, + "step": 9998 + }, + { + "epoch": 1.2719755756265108, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.78922462463379, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8746805191040039, + "num_tokens": 381579169.0, + "step": 9999 + }, + { + "epoch": 1.272102785905101, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.845108032226562, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8505120277404785, + "num_tokens": 381615501.0, + "step": 10000 + }, + { + "epoch": 1.2722299961836916, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.901500701904297, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8603140115737915, + "num_tokens": 381649737.0, + "step": 10001 + }, + { + "epoch": 1.2723572064622821, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.967864990234375, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8688300848007202, + "num_tokens": 381693776.0, + "step": 10002 + }, + { + "epoch": 1.2724844167408726, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.852825164794922, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8763976097106934, + "num_tokens": 381733324.0, + "step": 10003 + }, + { + "epoch": 1.2726116270194632, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.866697311401367, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8610801100730896, + "num_tokens": 381765893.0, + "step": 10004 + }, + { + "epoch": 1.2727388372980537, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.968820571899414, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8637559413909912, + "num_tokens": 381802782.0, + "step": 10005 + }, + { + "epoch": 1.2728660475766442, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.823158264160156, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8686919212341309, + "num_tokens": 381843448.0, + "step": 10006 + }, + { + "epoch": 1.2729932578552348, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.033483505249023, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8543577790260315, + "num_tokens": 381878850.0, + "step": 10007 + }, + { + "epoch": 1.2731204681338253, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05394744873047, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8450571298599243, + "num_tokens": 381915508.0, + "step": 10008 + }, + { + "epoch": 1.2732476784124156, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.891557693481445, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8559886813163757, + "num_tokens": 381952699.0, + "step": 10009 + }, + { + "epoch": 1.2733748886910061, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.97499656677246, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8592344522476196, + "num_tokens": 381989947.0, + "step": 10010 + }, + { + "epoch": 1.2735020989695967, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.001605987548828, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8652371168136597, + "num_tokens": 382027701.0, + "step": 10011 + }, + { + "epoch": 1.2736293092481872, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.040735244750977, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8641229867935181, + "num_tokens": 382066415.0, + "step": 10012 + }, + { + "epoch": 1.2737565195267777, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.81680679321289, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.859981894493103, + "num_tokens": 382108402.0, + "step": 10013 + }, + { + "epoch": 1.2738837298053682, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.880327224731445, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8662778735160828, + "num_tokens": 382146865.0, + "step": 10014 + }, + { + "epoch": 1.2740109400839588, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.846988677978516, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8623318672180176, + "num_tokens": 382186746.0, + "step": 10015 + }, + { + "epoch": 1.2741381503625493, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93829917907715, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8792344331741333, + "num_tokens": 382220693.0, + "step": 10016 + }, + { + "epoch": 1.2742653606411398, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.866390228271484, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.860175609588623, + "num_tokens": 382259549.0, + "step": 10017 + }, + { + "epoch": 1.2743925709197303, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.096012115478516, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8753584027290344, + "num_tokens": 382299164.0, + "step": 10018 + }, + { + "epoch": 1.2745197811983209, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.021486282348633, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8789733648300171, + "num_tokens": 382333596.0, + "step": 10019 + }, + { + "epoch": 1.2746469914769114, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.909860610961914, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8675063848495483, + "num_tokens": 382372321.0, + "step": 10020 + }, + { + "epoch": 1.274774201755502, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02135467529297, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8682237267494202, + "num_tokens": 382411125.0, + "step": 10021 + }, + { + "epoch": 1.2749014120340925, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.957128524780273, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8551837205886841, + "num_tokens": 382450935.0, + "step": 10022 + }, + { + "epoch": 1.275028622312683, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.830060958862305, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8635818362236023, + "num_tokens": 382488232.0, + "step": 10023 + }, + { + "epoch": 1.2751558325912733, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.944181442260742, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8609194755554199, + "num_tokens": 382522280.0, + "step": 10024 + }, + { + "epoch": 1.2752830428698638, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.782304763793945, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8759636282920837, + "num_tokens": 382562788.0, + "step": 10025 + }, + { + "epoch": 1.2754102531484544, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.926353454589844, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8592541217803955, + "num_tokens": 382598534.0, + "step": 10026 + }, + { + "epoch": 1.2755374634270449, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90672492980957, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8591501712799072, + "num_tokens": 382641803.0, + "step": 10027 + }, + { + "epoch": 1.2756646737056354, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.929325103759766, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.876036524772644, + "num_tokens": 382677871.0, + "step": 10028 + }, + { + "epoch": 1.275791883984226, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.080026626586914, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8745317459106445, + "num_tokens": 382714989.0, + "step": 10029 + }, + { + "epoch": 1.2759190942628165, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.864229202270508, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8686971664428711, + "num_tokens": 382751919.0, + "step": 10030 + }, + { + "epoch": 1.276046304541407, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.115802764892578, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8775615692138672, + "num_tokens": 382793550.0, + "step": 10031 + }, + { + "epoch": 1.2761735148199975, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.000011444091797, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8730332255363464, + "num_tokens": 382833142.0, + "step": 10032 + }, + { + "epoch": 1.2763007250985878, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.999439239501953, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8736664056777954, + "num_tokens": 382869934.0, + "step": 10033 + }, + { + "epoch": 1.2764279353771784, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.98760986328125, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8527160882949829, + "num_tokens": 382909533.0, + "step": 10034 + }, + { + "epoch": 1.2765551456557689, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.900468826293945, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.871943473815918, + "num_tokens": 382947462.0, + "step": 10035 + }, + { + "epoch": 1.2766823559343594, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.074527740478516, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8663119673728943, + "num_tokens": 382989973.0, + "step": 10036 + }, + { + "epoch": 1.27680956621295, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.867088317871094, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8641448020935059, + "num_tokens": 383033052.0, + "step": 10037 + }, + { + "epoch": 1.2769367764915405, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.074115753173828, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8645603656768799, + "num_tokens": 383075981.0, + "step": 10038 + }, + { + "epoch": 1.277063986770131, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83372688293457, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8815007209777832, + "num_tokens": 383111984.0, + "step": 10039 + }, + { + "epoch": 1.2771911970487215, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.059284210205078, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8669688105583191, + "num_tokens": 383155323.0, + "step": 10040 + }, + { + "epoch": 1.277318407327312, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.032201766967773, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.865221381187439, + "num_tokens": 383190069.0, + "step": 10041 + }, + { + "epoch": 1.2774456176059026, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.959840774536133, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8578329682350159, + "num_tokens": 383224999.0, + "step": 10042 + }, + { + "epoch": 1.2775728278844931, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.760082244873047, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8577996492385864, + "num_tokens": 383267764.0, + "step": 10043 + }, + { + "epoch": 1.2777000381630836, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.857196807861328, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8755072951316833, + "num_tokens": 383306148.0, + "step": 10044 + }, + { + "epoch": 1.2778272484416742, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.874189376831055, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8702820539474487, + "num_tokens": 383343408.0, + "step": 10045 + }, + { + "epoch": 1.2779544587202647, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.037382125854492, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8893554210662842, + "num_tokens": 383383626.0, + "step": 10046 + }, + { + "epoch": 1.2780816689988552, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.933732986450195, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8497313261032104, + "num_tokens": 383423218.0, + "step": 10047 + }, + { + "epoch": 1.2782088792774458, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90773582458496, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8744895458221436, + "num_tokens": 383461126.0, + "step": 10048 + }, + { + "epoch": 1.278336089556036, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.864389419555664, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8575332164764404, + "num_tokens": 383501808.0, + "step": 10049 + }, + { + "epoch": 1.2784632998346266, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00398826599121, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8867133855819702, + "num_tokens": 383542395.0, + "step": 10050 + }, + { + "epoch": 1.2785905101132171, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9088077545166, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8634546995162964, + "num_tokens": 383579968.0, + "step": 10051 + }, + { + "epoch": 1.2787177203918076, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.875850677490234, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8658164143562317, + "num_tokens": 383617861.0, + "step": 10052 + }, + { + "epoch": 1.2788449306703982, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.883365631103516, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8615378737449646, + "num_tokens": 383652534.0, + "step": 10053 + }, + { + "epoch": 1.2789721409489887, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91409683227539, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8619017004966736, + "num_tokens": 383693120.0, + "step": 10054 + }, + { + "epoch": 1.2790993512275792, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.951303482055664, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8570057153701782, + "num_tokens": 383734958.0, + "step": 10055 + }, + { + "epoch": 1.2792265615061698, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.919832229614258, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8589433431625366, + "num_tokens": 383768873.0, + "step": 10056 + }, + { + "epoch": 1.2793537717847603, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.019166946411133, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.867119312286377, + "num_tokens": 383804063.0, + "step": 10057 + }, + { + "epoch": 1.2794809820633506, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88640022277832, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8790660500526428, + "num_tokens": 383843098.0, + "step": 10058 + }, + { + "epoch": 1.2796081923419411, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.97015380859375, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8739265203475952, + "num_tokens": 383879443.0, + "step": 10059 + }, + { + "epoch": 1.2797354026205316, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.07989501953125, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.865649938583374, + "num_tokens": 383918071.0, + "step": 10060 + }, + { + "epoch": 1.2798626128991222, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.962812423706055, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8588442802429199, + "num_tokens": 383953052.0, + "step": 10061 + }, + { + "epoch": 1.2799898231777127, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.132219314575195, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8638615608215332, + "num_tokens": 383992835.0, + "step": 10062 + }, + { + "epoch": 1.2801170334563032, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.79248809814453, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8540195226669312, + "num_tokens": 384031234.0, + "step": 10063 + }, + { + "epoch": 1.2802442437348938, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.015600204467773, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8576083183288574, + "num_tokens": 384063611.0, + "step": 10064 + }, + { + "epoch": 1.2803714540134843, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86356544494629, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.874705970287323, + "num_tokens": 384106757.0, + "step": 10065 + }, + { + "epoch": 1.2804986642920748, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.061464309692383, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8698030710220337, + "num_tokens": 384143094.0, + "step": 10066 + }, + { + "epoch": 1.2806258745706653, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.034833908081055, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8658305406570435, + "num_tokens": 384181498.0, + "step": 10067 + }, + { + "epoch": 1.2807530848492559, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.928600311279297, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8699689507484436, + "num_tokens": 384209784.0, + "step": 10068 + }, + { + "epoch": 1.2808802951278464, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.177291870117188, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8712336421012878, + "num_tokens": 384248000.0, + "step": 10069 + }, + { + "epoch": 1.281007505406437, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8891544342041, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8570073843002319, + "num_tokens": 384290897.0, + "step": 10070 + }, + { + "epoch": 1.2811347156850275, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18122100830078, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8596429824829102, + "num_tokens": 384329464.0, + "step": 10071 + }, + { + "epoch": 1.281261925963618, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91671371459961, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8689696788787842, + "num_tokens": 384368915.0, + "step": 10072 + }, + { + "epoch": 1.2813891362422083, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.928247451782227, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8751234412193298, + "num_tokens": 384407445.0, + "step": 10073 + }, + { + "epoch": 1.2815163465207988, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.920259475708008, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8616291880607605, + "num_tokens": 384444384.0, + "step": 10074 + }, + { + "epoch": 1.2816435567993893, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.997220993041992, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.864131510257721, + "num_tokens": 384479670.0, + "step": 10075 + }, + { + "epoch": 1.2817707670779799, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.825551986694336, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8536064028739929, + "num_tokens": 384522436.0, + "step": 10076 + }, + { + "epoch": 1.2818979773565704, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.945823669433594, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8585255146026611, + "num_tokens": 384559754.0, + "step": 10077 + }, + { + "epoch": 1.282025187635161, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.75604820251465, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.863684892654419, + "num_tokens": 384595153.0, + "step": 10078 + }, + { + "epoch": 1.2821523979137515, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.058490753173828, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8649656176567078, + "num_tokens": 384628345.0, + "step": 10079 + }, + { + "epoch": 1.282279608192342, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.800973892211914, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8610638380050659, + "num_tokens": 384666725.0, + "step": 10080 + }, + { + "epoch": 1.2824068184709325, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83890724182129, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8570210933685303, + "num_tokens": 384706217.0, + "step": 10081 + }, + { + "epoch": 1.2825340287495228, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.909832000732422, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8619340062141418, + "num_tokens": 384741835.0, + "step": 10082 + }, + { + "epoch": 1.2826612390281134, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.959548950195312, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8672652840614319, + "num_tokens": 384777273.0, + "step": 10083 + }, + { + "epoch": 1.2827884493067039, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.975431442260742, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8800102472305298, + "num_tokens": 384814950.0, + "step": 10084 + }, + { + "epoch": 1.2829156595852944, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.81534194946289, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.857715904712677, + "num_tokens": 384848088.0, + "step": 10085 + }, + { + "epoch": 1.283042869863885, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.885120391845703, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8571262955665588, + "num_tokens": 384887361.0, + "step": 10086 + }, + { + "epoch": 1.2831700801424755, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.236305236816406, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.872145414352417, + "num_tokens": 384930459.0, + "step": 10087 + }, + { + "epoch": 1.283297290421066, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9294490814209, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8605785369873047, + "num_tokens": 384969506.0, + "step": 10088 + }, + { + "epoch": 1.2834245006996565, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.986337661743164, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.872430682182312, + "num_tokens": 385004368.0, + "step": 10089 + }, + { + "epoch": 1.283551710978247, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8586483001709, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8671197295188904, + "num_tokens": 385044553.0, + "step": 10090 + }, + { + "epoch": 1.2836789212568376, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05453872680664, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8780398964881897, + "num_tokens": 385076873.0, + "step": 10091 + }, + { + "epoch": 1.283806131535428, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.13959312438965, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.867400050163269, + "num_tokens": 385113969.0, + "step": 10092 + }, + { + "epoch": 1.2839333418140186, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.817096710205078, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8758916854858398, + "num_tokens": 385149460.0, + "step": 10093 + }, + { + "epoch": 1.2840605520926092, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05299186706543, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8489301204681396, + "num_tokens": 385187240.0, + "step": 10094 + }, + { + "epoch": 1.2841877623711997, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02628517150879, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.858318567276001, + "num_tokens": 385225532.0, + "step": 10095 + }, + { + "epoch": 1.2843149726497902, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95065689086914, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.868198037147522, + "num_tokens": 385267404.0, + "step": 10096 + }, + { + "epoch": 1.2844421829283807, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90690040588379, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8542594909667969, + "num_tokens": 385314008.0, + "step": 10097 + }, + { + "epoch": 1.284569393206971, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.899410247802734, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8529999256134033, + "num_tokens": 385352204.0, + "step": 10098 + }, + { + "epoch": 1.2846966034855616, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.155527114868164, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8574739694595337, + "num_tokens": 385393307.0, + "step": 10099 + }, + { + "epoch": 1.2848238137641521, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.986610412597656, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8666216731071472, + "num_tokens": 385426124.0, + "step": 10100 + }, + { + "epoch": 1.2849510240427426, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0319766998291, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8706622123718262, + "num_tokens": 385467471.0, + "step": 10101 + }, + { + "epoch": 1.2850782343213332, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88460922241211, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.856839656829834, + "num_tokens": 385508780.0, + "step": 10102 + }, + { + "epoch": 1.2852054445999237, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86720085144043, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.871507465839386, + "num_tokens": 385552646.0, + "step": 10103 + }, + { + "epoch": 1.2853326548785142, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.164796829223633, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8541481494903564, + "num_tokens": 385586676.0, + "step": 10104 + }, + { + "epoch": 1.2854598651571048, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.919301986694336, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8542358875274658, + "num_tokens": 385623201.0, + "step": 10105 + }, + { + "epoch": 1.2855870754356953, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.949478149414062, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.876160740852356, + "num_tokens": 385662716.0, + "step": 10106 + }, + { + "epoch": 1.2857142857142856, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.885435104370117, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8549283146858215, + "num_tokens": 385706986.0, + "step": 10107 + }, + { + "epoch": 1.2858414959928761, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90604019165039, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8651375770568848, + "num_tokens": 385744240.0, + "step": 10108 + }, + { + "epoch": 1.2859687062714666, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.725915908813477, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8493653535842896, + "num_tokens": 385778583.0, + "step": 10109 + }, + { + "epoch": 1.2860959165500572, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.967069625854492, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8665356636047363, + "num_tokens": 385819910.0, + "step": 10110 + }, + { + "epoch": 1.2862231268286477, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95337677001953, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8629027605056763, + "num_tokens": 385860068.0, + "step": 10111 + }, + { + "epoch": 1.2863503371072382, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.948951721191406, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8581677079200745, + "num_tokens": 385890460.0, + "step": 10112 + }, + { + "epoch": 1.2864775473858288, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93923568725586, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8623610138893127, + "num_tokens": 385926174.0, + "step": 10113 + }, + { + "epoch": 1.2866047576644193, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.943843841552734, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8747929334640503, + "num_tokens": 385962928.0, + "step": 10114 + }, + { + "epoch": 1.2867319679430098, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95085334777832, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.873732328414917, + "num_tokens": 386002509.0, + "step": 10115 + }, + { + "epoch": 1.2868591782216003, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.910430908203125, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8647783398628235, + "num_tokens": 386043304.0, + "step": 10116 + }, + { + "epoch": 1.2869863885001909, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.953811645507812, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8734457492828369, + "num_tokens": 386081607.0, + "step": 10117 + }, + { + "epoch": 1.2871135987787814, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.937271118164062, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8838725090026855, + "num_tokens": 386117762.0, + "step": 10118 + }, + { + "epoch": 1.287240809057372, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91828727722168, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8574016094207764, + "num_tokens": 386155908.0, + "step": 10119 + }, + { + "epoch": 1.2873680193359625, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.963401794433594, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8625819087028503, + "num_tokens": 386199385.0, + "step": 10120 + }, + { + "epoch": 1.287495229614553, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.94566535949707, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8608046174049377, + "num_tokens": 386231892.0, + "step": 10121 + }, + { + "epoch": 1.2876224398931433, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.779207229614258, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8492096662521362, + "num_tokens": 386269657.0, + "step": 10122 + }, + { + "epoch": 1.2877496501717338, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.873720169067383, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8698426485061646, + "num_tokens": 386306936.0, + "step": 10123 + }, + { + "epoch": 1.2878768604503243, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.07275390625, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8651816844940186, + "num_tokens": 386342260.0, + "step": 10124 + }, + { + "epoch": 1.2880040707289149, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.897661209106445, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8613000512123108, + "num_tokens": 386381784.0, + "step": 10125 + }, + { + "epoch": 1.2881312810075054, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.875165939331055, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8640440702438354, + "num_tokens": 386422350.0, + "step": 10126 + }, + { + "epoch": 1.288258491286096, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83440399169922, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8610059022903442, + "num_tokens": 386467951.0, + "step": 10127 + }, + { + "epoch": 1.2883857015646865, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.045373916625977, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8745805621147156, + "num_tokens": 386511896.0, + "step": 10128 + }, + { + "epoch": 1.288512911843277, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.130977630615234, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8524360656738281, + "num_tokens": 386550704.0, + "step": 10129 + }, + { + "epoch": 1.2886401221218675, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.855438232421875, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8481385707855225, + "num_tokens": 386589295.0, + "step": 10130 + }, + { + "epoch": 1.2887673324004578, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.01138687133789, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8651634454727173, + "num_tokens": 386623462.0, + "step": 10131 + }, + { + "epoch": 1.2888945426790483, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.850994110107422, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8685504198074341, + "num_tokens": 386663036.0, + "step": 10132 + }, + { + "epoch": 1.2890217529576389, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.813949584960938, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8649156093597412, + "num_tokens": 386702530.0, + "step": 10133 + }, + { + "epoch": 1.2891489632362294, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.828882217407227, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8797088861465454, + "num_tokens": 386739758.0, + "step": 10134 + }, + { + "epoch": 1.28927617351482, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.010387420654297, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8750030994415283, + "num_tokens": 386778287.0, + "step": 10135 + }, + { + "epoch": 1.2894033837934105, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.797752380371094, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8755186796188354, + "num_tokens": 386812348.0, + "step": 10136 + }, + { + "epoch": 1.289530594072001, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9027042388916, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8729345202445984, + "num_tokens": 386855540.0, + "step": 10137 + }, + { + "epoch": 1.2896578043505915, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.860157012939453, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8610437512397766, + "num_tokens": 386893775.0, + "step": 10138 + }, + { + "epoch": 1.289785014629182, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.893789291381836, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8654699325561523, + "num_tokens": 386929553.0, + "step": 10139 + }, + { + "epoch": 1.2899122249077726, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.795028686523438, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8595452904701233, + "num_tokens": 386968033.0, + "step": 10140 + }, + { + "epoch": 1.290039435186363, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.88558006286621, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8645716905593872, + "num_tokens": 387004967.0, + "step": 10141 + }, + { + "epoch": 1.2901666454649536, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.922470092773438, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8530144691467285, + "num_tokens": 387038086.0, + "step": 10142 + }, + { + "epoch": 1.2902938557435442, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.751815795898438, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8403088450431824, + "num_tokens": 387077112.0, + "step": 10143 + }, + { + "epoch": 1.2904210660221347, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.015607833862305, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8490691184997559, + "num_tokens": 387118800.0, + "step": 10144 + }, + { + "epoch": 1.2905482763007252, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.044815063476562, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8604350090026855, + "num_tokens": 387160321.0, + "step": 10145 + }, + { + "epoch": 1.2906754865793157, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.949710845947266, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8628051280975342, + "num_tokens": 387199387.0, + "step": 10146 + }, + { + "epoch": 1.290802696857906, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.887313842773438, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8712968826293945, + "num_tokens": 387238859.0, + "step": 10147 + }, + { + "epoch": 1.2909299071364966, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.889400482177734, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8720003366470337, + "num_tokens": 387272149.0, + "step": 10148 + }, + { + "epoch": 1.291057117415087, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.038766860961914, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8644585609436035, + "num_tokens": 387311617.0, + "step": 10149 + }, + { + "epoch": 1.2911843276936776, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.001113891601562, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8522526621818542, + "num_tokens": 387344281.0, + "step": 10150 + }, + { + "epoch": 1.2913115379722682, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.949382781982422, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8547031879425049, + "num_tokens": 387385959.0, + "step": 10151 + }, + { + "epoch": 1.2914387482508587, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.98223114013672, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8607690334320068, + "num_tokens": 387426197.0, + "step": 10152 + }, + { + "epoch": 1.2915659585294492, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.896896362304688, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8704015016555786, + "num_tokens": 387462250.0, + "step": 10153 + }, + { + "epoch": 1.2916931688080397, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03302574157715, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8652259111404419, + "num_tokens": 387499903.0, + "step": 10154 + }, + { + "epoch": 1.2918203790866303, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.813730239868164, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8718845844268799, + "num_tokens": 387535361.0, + "step": 10155 + }, + { + "epoch": 1.2919475893652206, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.973264694213867, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8614985942840576, + "num_tokens": 387574300.0, + "step": 10156 + }, + { + "epoch": 1.2920747996438111, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86841583251953, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8668714761734009, + "num_tokens": 387606860.0, + "step": 10157 + }, + { + "epoch": 1.2922020099224016, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.957576751708984, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8550201654434204, + "num_tokens": 387646519.0, + "step": 10158 + }, + { + "epoch": 1.2923292202009922, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.150440216064453, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8770906329154968, + "num_tokens": 387681100.0, + "step": 10159 + }, + { + "epoch": 1.2924564304795827, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.801973342895508, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8870806097984314, + "num_tokens": 387715962.0, + "step": 10160 + }, + { + "epoch": 1.2925836407581732, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.94782829284668, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8636958599090576, + "num_tokens": 387755856.0, + "step": 10161 + }, + { + "epoch": 1.2927108510367638, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91900062561035, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8731904029846191, + "num_tokens": 387793256.0, + "step": 10162 + }, + { + "epoch": 1.2928380613153543, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.988527297973633, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8610731363296509, + "num_tokens": 387834696.0, + "step": 10163 + }, + { + "epoch": 1.2929652715939448, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93653106689453, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8533079624176025, + "num_tokens": 387874973.0, + "step": 10164 + }, + { + "epoch": 1.2930924818725353, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91136360168457, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8552244901657104, + "num_tokens": 387904887.0, + "step": 10165 + }, + { + "epoch": 1.2932196921511259, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.885589599609375, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8719035387039185, + "num_tokens": 387944676.0, + "step": 10166 + }, + { + "epoch": 1.2933469024297164, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18515396118164, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8659337162971497, + "num_tokens": 387982234.0, + "step": 10167 + }, + { + "epoch": 1.293474112708307, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.938453674316406, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8672664165496826, + "num_tokens": 388018001.0, + "step": 10168 + }, + { + "epoch": 1.2936013229868975, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02667236328125, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8604928255081177, + "num_tokens": 388057476.0, + "step": 10169 + }, + { + "epoch": 1.293728533265488, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.086654663085938, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8832195997238159, + "num_tokens": 388095351.0, + "step": 10170 + }, + { + "epoch": 1.2938557435440783, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.924528121948242, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8655388355255127, + "num_tokens": 388135366.0, + "step": 10171 + }, + { + "epoch": 1.2939829538226688, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03012466430664, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8539080619812012, + "num_tokens": 388175003.0, + "step": 10172 + }, + { + "epoch": 1.2941101641012593, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.015586853027344, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8457714319229126, + "num_tokens": 388207083.0, + "step": 10173 + }, + { + "epoch": 1.2942373743798499, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.989444732666016, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8815861344337463, + "num_tokens": 388245038.0, + "step": 10174 + }, + { + "epoch": 1.2943645846584404, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.918411254882812, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8611471056938171, + "num_tokens": 388285035.0, + "step": 10175 + }, + { + "epoch": 1.294491794937031, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.025794982910156, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8702867031097412, + "num_tokens": 388323971.0, + "step": 10176 + }, + { + "epoch": 1.2946190052156215, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90519142150879, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8607064485549927, + "num_tokens": 388362256.0, + "step": 10177 + }, + { + "epoch": 1.294746215494212, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.000778198242188, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.885300874710083, + "num_tokens": 388401120.0, + "step": 10178 + }, + { + "epoch": 1.2948734257728025, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.068134307861328, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8674697875976562, + "num_tokens": 388446833.0, + "step": 10179 + }, + { + "epoch": 1.2950006360513928, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.089656829833984, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8770636916160583, + "num_tokens": 388482266.0, + "step": 10180 + }, + { + "epoch": 1.2951278463299833, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90232276916504, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8663163185119629, + "num_tokens": 388518024.0, + "step": 10181 + }, + { + "epoch": 1.2952550566085739, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.097375869750977, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8680708408355713, + "num_tokens": 388564246.0, + "step": 10182 + }, + { + "epoch": 1.2953822668871644, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.843669891357422, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8695739507675171, + "num_tokens": 388603556.0, + "step": 10183 + }, + { + "epoch": 1.295509477165755, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.039207458496094, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8743935227394104, + "num_tokens": 388643615.0, + "step": 10184 + }, + { + "epoch": 1.2956366874443455, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.10517120361328, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8686292171478271, + "num_tokens": 388682917.0, + "step": 10185 + }, + { + "epoch": 1.295763897722936, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.096887588500977, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8692359924316406, + "num_tokens": 388720269.0, + "step": 10186 + }, + { + "epoch": 1.2958911080015265, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0175724029541, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8581850528717041, + "num_tokens": 388759245.0, + "step": 10187 + }, + { + "epoch": 1.296018318280117, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86217498779297, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8614667654037476, + "num_tokens": 388795673.0, + "step": 10188 + }, + { + "epoch": 1.2961455285587076, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.12934112548828, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8695245981216431, + "num_tokens": 388832926.0, + "step": 10189 + }, + { + "epoch": 1.296272738837298, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.12151336669922, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.869469404220581, + "num_tokens": 388869767.0, + "step": 10190 + }, + { + "epoch": 1.2963999491158886, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.989486694335938, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8687117099761963, + "num_tokens": 388913347.0, + "step": 10191 + }, + { + "epoch": 1.2965271593944792, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0508975982666, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8777363300323486, + "num_tokens": 388948211.0, + "step": 10192 + }, + { + "epoch": 1.2966543696730697, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.91111946105957, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8712433576583862, + "num_tokens": 388989198.0, + "step": 10193 + }, + { + "epoch": 1.2967815799516602, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.894866943359375, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8574418425559998, + "num_tokens": 389029076.0, + "step": 10194 + }, + { + "epoch": 1.2969087902302507, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.038066864013672, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8655685186386108, + "num_tokens": 389066751.0, + "step": 10195 + }, + { + "epoch": 1.297036000508841, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.928791046142578, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8641819953918457, + "num_tokens": 389104207.0, + "step": 10196 + }, + { + "epoch": 1.2971632107874316, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.912532806396484, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8689970374107361, + "num_tokens": 389141941.0, + "step": 10197 + }, + { + "epoch": 1.297290421066022, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.092636108398438, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8764795660972595, + "num_tokens": 389185326.0, + "step": 10198 + }, + { + "epoch": 1.2974176313446126, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96990203857422, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8751838207244873, + "num_tokens": 389221512.0, + "step": 10199 + }, + { + "epoch": 1.2975448416232032, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96732521057129, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8732080459594727, + "num_tokens": 389259388.0, + "step": 10200 + }, + { + "epoch": 1.2976720519017937, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.114469528198242, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.855767548084259, + "num_tokens": 389298773.0, + "step": 10201 + }, + { + "epoch": 1.2977992621803842, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.054149627685547, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8674196004867554, + "num_tokens": 389339680.0, + "step": 10202 + }, + { + "epoch": 1.2979264724589747, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.063129425048828, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8686509728431702, + "num_tokens": 389380755.0, + "step": 10203 + }, + { + "epoch": 1.2980536827375653, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.089677810668945, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8710779547691345, + "num_tokens": 389412031.0, + "step": 10204 + }, + { + "epoch": 1.2981808930161556, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.137609481811523, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.876421332359314, + "num_tokens": 389448299.0, + "step": 10205 + }, + { + "epoch": 1.298308103294746, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.92708969116211, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8607763648033142, + "num_tokens": 389483962.0, + "step": 10206 + }, + { + "epoch": 1.2984353135733366, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25873565673828, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8587881922721863, + "num_tokens": 389521989.0, + "step": 10207 + }, + { + "epoch": 1.2985625238519272, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.99397850036621, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.860968828201294, + "num_tokens": 389558228.0, + "step": 10208 + }, + { + "epoch": 1.2986897341305177, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.047161102294922, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.865288257598877, + "num_tokens": 389597164.0, + "step": 10209 + }, + { + "epoch": 1.2988169444091082, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.047672271728516, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8671499490737915, + "num_tokens": 389636435.0, + "step": 10210 + }, + { + "epoch": 1.2989441546876987, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.962276458740234, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8592134714126587, + "num_tokens": 389670498.0, + "step": 10211 + }, + { + "epoch": 1.2990713649662893, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.006162643432617, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8679187297821045, + "num_tokens": 389707433.0, + "step": 10212 + }, + { + "epoch": 1.2991985752448798, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.014060974121094, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8673571348190308, + "num_tokens": 389747300.0, + "step": 10213 + }, + { + "epoch": 1.2993257855234703, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04743766784668, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8601019978523254, + "num_tokens": 389788381.0, + "step": 10214 + }, + { + "epoch": 1.2994529958020609, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.993507385253906, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8698432445526123, + "num_tokens": 389821792.0, + "step": 10215 + }, + { + "epoch": 1.2995802060806514, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.038970947265625, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8776997923851013, + "num_tokens": 389859370.0, + "step": 10216 + }, + { + "epoch": 1.299707416359242, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.145421981811523, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8576596975326538, + "num_tokens": 389897020.0, + "step": 10217 + }, + { + "epoch": 1.2998346266378324, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.114547729492188, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8397117853164673, + "num_tokens": 389927107.0, + "step": 10218 + }, + { + "epoch": 1.299961836916423, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.168977737426758, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8636810779571533, + "num_tokens": 389966323.0, + "step": 10219 + }, + { + "epoch": 1.3000890471950133, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.01602554321289, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8649908304214478, + "num_tokens": 390001416.0, + "step": 10220 + }, + { + "epoch": 1.3002162574736038, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.001161575317383, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.868524968624115, + "num_tokens": 390042326.0, + "step": 10221 + }, + { + "epoch": 1.3003434677521943, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96146583557129, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8560380935668945, + "num_tokens": 390084443.0, + "step": 10222 + }, + { + "epoch": 1.3004706780307849, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03444480895996, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8593226671218872, + "num_tokens": 390126309.0, + "step": 10223 + }, + { + "epoch": 1.3005978883093754, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.991748809814453, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8711293935775757, + "num_tokens": 390161463.0, + "step": 10224 + }, + { + "epoch": 1.300725098587966, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09885597229004, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8761864304542542, + "num_tokens": 390203221.0, + "step": 10225 + }, + { + "epoch": 1.3008523088665565, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.917396545410156, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8815864324569702, + "num_tokens": 390243269.0, + "step": 10226 + }, + { + "epoch": 1.300979519145147, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.150779724121094, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.867243766784668, + "num_tokens": 390282488.0, + "step": 10227 + }, + { + "epoch": 1.3011067294237375, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.06266975402832, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8619963526725769, + "num_tokens": 390320268.0, + "step": 10228 + }, + { + "epoch": 1.3012339397023278, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.010087966918945, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8563230037689209, + "num_tokens": 390354733.0, + "step": 10229 + }, + { + "epoch": 1.3013611499809183, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.286529541015625, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8658027648925781, + "num_tokens": 390388161.0, + "step": 10230 + }, + { + "epoch": 1.3014883602595089, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.982074737548828, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8761374354362488, + "num_tokens": 390424609.0, + "step": 10231 + }, + { + "epoch": 1.3016155705380994, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18181037902832, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8828058838844299, + "num_tokens": 390460809.0, + "step": 10232 + }, + { + "epoch": 1.30174278081669, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.22838020324707, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.856115460395813, + "num_tokens": 390500198.0, + "step": 10233 + }, + { + "epoch": 1.3018699910952805, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.079896926879883, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8591326475143433, + "num_tokens": 390545588.0, + "step": 10234 + }, + { + "epoch": 1.301997201373871, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02450942993164, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.856275200843811, + "num_tokens": 390581681.0, + "step": 10235 + }, + { + "epoch": 1.3021244116524615, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.121925354003906, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8837194442749023, + "num_tokens": 390619122.0, + "step": 10236 + }, + { + "epoch": 1.302251621931052, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.188535690307617, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8539903163909912, + "num_tokens": 390667974.0, + "step": 10237 + }, + { + "epoch": 1.3023788322096426, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.92450523376465, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8684545159339905, + "num_tokens": 390705188.0, + "step": 10238 + }, + { + "epoch": 1.302506042488233, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25792694091797, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8757315278053284, + "num_tokens": 390746892.0, + "step": 10239 + }, + { + "epoch": 1.3026332527668236, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.92387580871582, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8813864588737488, + "num_tokens": 390787727.0, + "step": 10240 + }, + { + "epoch": 1.3027604630454142, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.009431838989258, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8591275811195374, + "num_tokens": 390823702.0, + "step": 10241 + }, + { + "epoch": 1.3028876733240047, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.992107391357422, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8650516271591187, + "num_tokens": 390860256.0, + "step": 10242 + }, + { + "epoch": 1.3030148836025952, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02952003479004, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8792061805725098, + "num_tokens": 390901676.0, + "step": 10243 + }, + { + "epoch": 1.3031420938811857, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0601806640625, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8491756916046143, + "num_tokens": 390936911.0, + "step": 10244 + }, + { + "epoch": 1.303269304159776, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.857669830322266, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8540027141571045, + "num_tokens": 390976185.0, + "step": 10245 + }, + { + "epoch": 1.3033965144383666, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.98483657836914, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8664912581443787, + "num_tokens": 391014661.0, + "step": 10246 + }, + { + "epoch": 1.303523724716957, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.839723587036133, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8586164712905884, + "num_tokens": 391056205.0, + "step": 10247 + }, + { + "epoch": 1.3036509349955476, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.061664581298828, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8673911094665527, + "num_tokens": 391092187.0, + "step": 10248 + }, + { + "epoch": 1.3037781452741382, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.003097534179688, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8810434341430664, + "num_tokens": 391130890.0, + "step": 10249 + }, + { + "epoch": 1.3039053555527287, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.6935977935791, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8759907484054565, + "num_tokens": 391172783.0, + "step": 10250 + }, + { + "epoch": 1.3040325658313192, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.258331298828125, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8493397235870361, + "num_tokens": 391215031.0, + "step": 10251 + }, + { + "epoch": 1.3041597761099097, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0166015625, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8698444366455078, + "num_tokens": 391251400.0, + "step": 10252 + }, + { + "epoch": 1.3042869863885003, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.983667373657227, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8577926158905029, + "num_tokens": 391285895.0, + "step": 10253 + }, + { + "epoch": 1.3044141966670906, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96523094177246, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8683834671974182, + "num_tokens": 391328361.0, + "step": 10254 + }, + { + "epoch": 1.304541406945681, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.969268798828125, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8651113510131836, + "num_tokens": 391365019.0, + "step": 10255 + }, + { + "epoch": 1.3046686172242716, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.83987045288086, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.849940836429596, + "num_tokens": 391400407.0, + "step": 10256 + }, + { + "epoch": 1.3047958275028622, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.08147621154785, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8486430644989014, + "num_tokens": 391440660.0, + "step": 10257 + }, + { + "epoch": 1.3049230377814527, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.983604431152344, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8690841794013977, + "num_tokens": 391481096.0, + "step": 10258 + }, + { + "epoch": 1.3050502480600432, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93402099609375, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8657318353652954, + "num_tokens": 391516356.0, + "step": 10259 + }, + { + "epoch": 1.3051774583386337, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.021072387695312, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8545616865158081, + "num_tokens": 391554691.0, + "step": 10260 + }, + { + "epoch": 1.3053046686172243, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93672752380371, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8520273566246033, + "num_tokens": 391592473.0, + "step": 10261 + }, + { + "epoch": 1.3054318788958148, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0076847076416, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8710421323776245, + "num_tokens": 391632091.0, + "step": 10262 + }, + { + "epoch": 1.3055590891744053, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.027299880981445, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8593637943267822, + "num_tokens": 391676121.0, + "step": 10263 + }, + { + "epoch": 1.3056862994529959, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0004825592041, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8631504774093628, + "num_tokens": 391709664.0, + "step": 10264 + }, + { + "epoch": 1.3058135097315864, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.914226531982422, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8547161221504211, + "num_tokens": 391755828.0, + "step": 10265 + }, + { + "epoch": 1.305940720010177, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.093210220336914, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8824391961097717, + "num_tokens": 391800232.0, + "step": 10266 + }, + { + "epoch": 1.3060679302887674, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.145456314086914, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8675188422203064, + "num_tokens": 391837612.0, + "step": 10267 + }, + { + "epoch": 1.306195140567358, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.829822540283203, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.876248836517334, + "num_tokens": 391871193.0, + "step": 10268 + }, + { + "epoch": 1.3063223508459483, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.12729835510254, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8574445843696594, + "num_tokens": 391908203.0, + "step": 10269 + }, + { + "epoch": 1.3064495611245388, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93979835510254, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.871984601020813, + "num_tokens": 391938744.0, + "step": 10270 + }, + { + "epoch": 1.3065767714031293, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.070205688476562, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8680330514907837, + "num_tokens": 391976020.0, + "step": 10271 + }, + { + "epoch": 1.3067039816817199, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.891632080078125, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8575852513313293, + "num_tokens": 392018006.0, + "step": 10272 + }, + { + "epoch": 1.3068311919603104, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03538703918457, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8637100458145142, + "num_tokens": 392055358.0, + "step": 10273 + }, + { + "epoch": 1.306958402238901, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.086397171020508, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.864269495010376, + "num_tokens": 392097727.0, + "step": 10274 + }, + { + "epoch": 1.3070856125174914, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.06606674194336, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8710687160491943, + "num_tokens": 392144641.0, + "step": 10275 + }, + { + "epoch": 1.307212822796082, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.194927215576172, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8740710020065308, + "num_tokens": 392178810.0, + "step": 10276 + }, + { + "epoch": 1.3073400330746725, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0018253326416, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.862067461013794, + "num_tokens": 392213571.0, + "step": 10277 + }, + { + "epoch": 1.3074672433532628, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.174142837524414, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8616161346435547, + "num_tokens": 392254628.0, + "step": 10278 + }, + { + "epoch": 1.3075944536318533, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.001794815063477, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8649158477783203, + "num_tokens": 392297243.0, + "step": 10279 + }, + { + "epoch": 1.3077216639104439, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16172981262207, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8678190112113953, + "num_tokens": 392329116.0, + "step": 10280 + }, + { + "epoch": 1.3078488741890344, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.093719482421875, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8629975318908691, + "num_tokens": 392370542.0, + "step": 10281 + }, + { + "epoch": 1.307976084467625, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.201597213745117, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8712180256843567, + "num_tokens": 392411155.0, + "step": 10282 + }, + { + "epoch": 1.3081032947462155, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.153255462646484, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.866313099861145, + "num_tokens": 392450440.0, + "step": 10283 + }, + { + "epoch": 1.308230505024806, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.092578887939453, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8765254020690918, + "num_tokens": 392488545.0, + "step": 10284 + }, + { + "epoch": 1.3083577153033965, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.92449188232422, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8646023273468018, + "num_tokens": 392527138.0, + "step": 10285 + }, + { + "epoch": 1.308484925581987, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.84668731689453, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8664706945419312, + "num_tokens": 392568181.0, + "step": 10286 + }, + { + "epoch": 1.3086121358605776, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.137062072753906, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8803291916847229, + "num_tokens": 392610900.0, + "step": 10287 + }, + { + "epoch": 1.308739346139168, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.953798294067383, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8583595752716064, + "num_tokens": 392646624.0, + "step": 10288 + }, + { + "epoch": 1.3088665564177586, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.928955078125, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8659532070159912, + "num_tokens": 392683103.0, + "step": 10289 + }, + { + "epoch": 1.3089937666963491, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0511474609375, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8675127029418945, + "num_tokens": 392723169.0, + "step": 10290 + }, + { + "epoch": 1.3091209769749397, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.049402236938477, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8804230093955994, + "num_tokens": 392762323.0, + "step": 10291 + }, + { + "epoch": 1.3092481872535302, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96760368347168, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8822726011276245, + "num_tokens": 392797555.0, + "step": 10292 + }, + { + "epoch": 1.3093753975321207, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.971609115600586, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8764487504959106, + "num_tokens": 392840001.0, + "step": 10293 + }, + { + "epoch": 1.309502607810711, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.01659393310547, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8654167652130127, + "num_tokens": 392880871.0, + "step": 10294 + }, + { + "epoch": 1.3096298180893016, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05430030822754, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.876457691192627, + "num_tokens": 392917554.0, + "step": 10295 + }, + { + "epoch": 1.309757028367892, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95953941345215, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8843627572059631, + "num_tokens": 392958601.0, + "step": 10296 + }, + { + "epoch": 1.3098842386464826, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.99981117248535, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8686306476593018, + "num_tokens": 393000108.0, + "step": 10297 + }, + { + "epoch": 1.3100114489250732, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00632095336914, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8869925141334534, + "num_tokens": 393032321.0, + "step": 10298 + }, + { + "epoch": 1.3101386592036637, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.193769454956055, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8650839328765869, + "num_tokens": 393068347.0, + "step": 10299 + }, + { + "epoch": 1.3102658694822542, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.965290069580078, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8830186724662781, + "num_tokens": 393102509.0, + "step": 10300 + }, + { + "epoch": 1.3103930797608447, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.166147232055664, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8753930926322937, + "num_tokens": 393140974.0, + "step": 10301 + }, + { + "epoch": 1.3105202900394353, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.08130645751953, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8764786720275879, + "num_tokens": 393177214.0, + "step": 10302 + }, + { + "epoch": 1.3106475003180256, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02735137939453, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8694853782653809, + "num_tokens": 393215801.0, + "step": 10303 + }, + { + "epoch": 1.310774710596616, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.040971755981445, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8781275749206543, + "num_tokens": 393256444.0, + "step": 10304 + }, + { + "epoch": 1.3109019208752066, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.073949813842773, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8632313013076782, + "num_tokens": 393300238.0, + "step": 10305 + }, + { + "epoch": 1.3110291311537972, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95353889465332, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8528180718421936, + "num_tokens": 393344087.0, + "step": 10306 + }, + { + "epoch": 1.3111563414323877, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.972126007080078, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.871436357498169, + "num_tokens": 393381678.0, + "step": 10307 + }, + { + "epoch": 1.3112835517109782, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.129304885864258, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8651110529899597, + "num_tokens": 393415740.0, + "step": 10308 + }, + { + "epoch": 1.3114107619895687, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18030548095703, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8423784971237183, + "num_tokens": 393457692.0, + "step": 10309 + }, + { + "epoch": 1.3115379722681593, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93732261657715, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.862130880355835, + "num_tokens": 393499003.0, + "step": 10310 + }, + { + "epoch": 1.3116651825467498, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.8957576751709, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.856103777885437, + "num_tokens": 393533842.0, + "step": 10311 + }, + { + "epoch": 1.3117923928253403, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15323257446289, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8747717142105103, + "num_tokens": 393573906.0, + "step": 10312 + }, + { + "epoch": 1.3119196031039309, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.099098205566406, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8653003573417664, + "num_tokens": 393615298.0, + "step": 10313 + }, + { + "epoch": 1.3120468133825214, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.075302124023438, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8560301065444946, + "num_tokens": 393654690.0, + "step": 10314 + }, + { + "epoch": 1.312174023661112, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.006189346313477, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8688600659370422, + "num_tokens": 393696222.0, + "step": 10315 + }, + { + "epoch": 1.3123012339397024, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.125978469848633, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8626009225845337, + "num_tokens": 393736748.0, + "step": 10316 + }, + { + "epoch": 1.312428444218293, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.884084701538086, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8578808307647705, + "num_tokens": 393773077.0, + "step": 10317 + }, + { + "epoch": 1.3125556544968833, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.160789489746094, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8781166076660156, + "num_tokens": 393806466.0, + "step": 10318 + }, + { + "epoch": 1.3126828647754738, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.022533416748047, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8635622262954712, + "num_tokens": 393840922.0, + "step": 10319 + }, + { + "epoch": 1.3128100750540643, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90799903869629, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8708962202072144, + "num_tokens": 393874544.0, + "step": 10320 + }, + { + "epoch": 1.3129372853326549, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.116147994995117, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8628519177436829, + "num_tokens": 393917773.0, + "step": 10321 + }, + { + "epoch": 1.3130644956112454, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.12742805480957, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8634279370307922, + "num_tokens": 393951525.0, + "step": 10322 + }, + { + "epoch": 1.313191705889836, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.162492752075195, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8716208934783936, + "num_tokens": 393989796.0, + "step": 10323 + }, + { + "epoch": 1.3133189161684264, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.008153915405273, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8640961050987244, + "num_tokens": 394028746.0, + "step": 10324 + }, + { + "epoch": 1.313446126447017, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.232938766479492, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8756217360496521, + "num_tokens": 394065241.0, + "step": 10325 + }, + { + "epoch": 1.3135733367256075, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09050178527832, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8617702722549438, + "num_tokens": 394110113.0, + "step": 10326 + }, + { + "epoch": 1.3137005470041978, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.00052261352539, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8502570390701294, + "num_tokens": 394146577.0, + "step": 10327 + }, + { + "epoch": 1.3138277572827883, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.276451110839844, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8662210702896118, + "num_tokens": 394185392.0, + "step": 10328 + }, + { + "epoch": 1.3139549675613789, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.984874725341797, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8579300045967102, + "num_tokens": 394226301.0, + "step": 10329 + }, + { + "epoch": 1.3140821778399694, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.942190170288086, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8582623600959778, + "num_tokens": 394262654.0, + "step": 10330 + }, + { + "epoch": 1.31420938811856, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.047266006469727, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8776016235351562, + "num_tokens": 394299185.0, + "step": 10331 + }, + { + "epoch": 1.3143365983971504, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.016462326049805, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8680324554443359, + "num_tokens": 394331364.0, + "step": 10332 + }, + { + "epoch": 1.314463808675741, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.230249404907227, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8549169301986694, + "num_tokens": 394375479.0, + "step": 10333 + }, + { + "epoch": 1.3145910189543315, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.863447189331055, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8725014328956604, + "num_tokens": 394414913.0, + "step": 10334 + }, + { + "epoch": 1.314718229232922, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.191598892211914, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.866843581199646, + "num_tokens": 394450889.0, + "step": 10335 + }, + { + "epoch": 1.3148454395115126, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.013925552368164, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8501983880996704, + "num_tokens": 394495280.0, + "step": 10336 + }, + { + "epoch": 1.314972649790103, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.060293197631836, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8749328851699829, + "num_tokens": 394536811.0, + "step": 10337 + }, + { + "epoch": 1.3150998600686936, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.230104446411133, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8643797636032104, + "num_tokens": 394573353.0, + "step": 10338 + }, + { + "epoch": 1.3152270703472841, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15944480895996, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8585801124572754, + "num_tokens": 394603517.0, + "step": 10339 + }, + { + "epoch": 1.3153542806258747, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.174009323120117, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8725186586380005, + "num_tokens": 394650547.0, + "step": 10340 + }, + { + "epoch": 1.3154814909044652, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.025714874267578, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8695839643478394, + "num_tokens": 394684460.0, + "step": 10341 + }, + { + "epoch": 1.3156087011830557, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.255733489990234, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8820706009864807, + "num_tokens": 394733520.0, + "step": 10342 + }, + { + "epoch": 1.315735911461646, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2296199798584, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8607688546180725, + "num_tokens": 394775498.0, + "step": 10343 + }, + { + "epoch": 1.3158631217402366, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.19192886352539, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8538272380828857, + "num_tokens": 394815138.0, + "step": 10344 + }, + { + "epoch": 1.315990332018827, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.086753845214844, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8579543828964233, + "num_tokens": 394850447.0, + "step": 10345 + }, + { + "epoch": 1.3161175422974176, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04155731201172, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8599941730499268, + "num_tokens": 394891800.0, + "step": 10346 + }, + { + "epoch": 1.3162447525760081, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.86321449279785, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8730250597000122, + "num_tokens": 394930087.0, + "step": 10347 + }, + { + "epoch": 1.3163719628545987, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.942296981811523, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8764703273773193, + "num_tokens": 394964497.0, + "step": 10348 + }, + { + "epoch": 1.3164991731331892, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21259307861328, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8491621017456055, + "num_tokens": 395003127.0, + "step": 10349 + }, + { + "epoch": 1.3166263834117797, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90047836303711, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8557009100914001, + "num_tokens": 395040570.0, + "step": 10350 + }, + { + "epoch": 1.3167535936903703, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.009307861328125, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.866706132888794, + "num_tokens": 395075205.0, + "step": 10351 + }, + { + "epoch": 1.3168808039689606, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.951534271240234, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.865694522857666, + "num_tokens": 395117492.0, + "step": 10352 + }, + { + "epoch": 1.317008014247551, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.90319061279297, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8636441826820374, + "num_tokens": 395162449.0, + "step": 10353 + }, + { + "epoch": 1.3171352245261416, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18878173828125, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8749674558639526, + "num_tokens": 395197310.0, + "step": 10354 + }, + { + "epoch": 1.3172624348047322, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.040498733520508, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8771616220474243, + "num_tokens": 395232420.0, + "step": 10355 + }, + { + "epoch": 1.3173896450833227, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.12660789489746, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8450717926025391, + "num_tokens": 395269392.0, + "step": 10356 + }, + { + "epoch": 1.3175168553619132, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.940357208251953, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.85463547706604, + "num_tokens": 395305496.0, + "step": 10357 + }, + { + "epoch": 1.3176440656405037, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.255416870117188, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8745422959327698, + "num_tokens": 395347089.0, + "step": 10358 + }, + { + "epoch": 1.3177712759190943, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.885683059692383, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8763874769210815, + "num_tokens": 395382885.0, + "step": 10359 + }, + { + "epoch": 1.3178984861976848, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.899700164794922, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8799139261245728, + "num_tokens": 395416260.0, + "step": 10360 + }, + { + "epoch": 1.3180256964762753, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.24064826965332, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8570395708084106, + "num_tokens": 395454179.0, + "step": 10361 + }, + { + "epoch": 1.3181529067548658, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.137300491333008, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8566204905509949, + "num_tokens": 395489173.0, + "step": 10362 + }, + { + "epoch": 1.3182801170334564, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.169038772583008, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8596974611282349, + "num_tokens": 395532057.0, + "step": 10363 + }, + { + "epoch": 1.318407327312047, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.14043617248535, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8725125193595886, + "num_tokens": 395570321.0, + "step": 10364 + }, + { + "epoch": 1.3185345375906374, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.054420471191406, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8862718343734741, + "num_tokens": 395609041.0, + "step": 10365 + }, + { + "epoch": 1.318661747869228, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.026329040527344, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.842941164970398, + "num_tokens": 395649961.0, + "step": 10366 + }, + { + "epoch": 1.3187889581478183, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03306770324707, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8674572706222534, + "num_tokens": 395691882.0, + "step": 10367 + }, + { + "epoch": 1.3189161684264088, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.151708602905273, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8539906740188599, + "num_tokens": 395731496.0, + "step": 10368 + }, + { + "epoch": 1.3190433787049993, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.064926147460938, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8700558543205261, + "num_tokens": 395772618.0, + "step": 10369 + }, + { + "epoch": 1.3191705889835899, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.100854873657227, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8497713804244995, + "num_tokens": 395811907.0, + "step": 10370 + }, + { + "epoch": 1.3192977992621804, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.19289207458496, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8605749607086182, + "num_tokens": 395851186.0, + "step": 10371 + }, + { + "epoch": 1.319425009540771, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.152563095092773, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8659510612487793, + "num_tokens": 395893868.0, + "step": 10372 + }, + { + "epoch": 1.3195522198193614, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.951974868774414, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8757723569869995, + "num_tokens": 395930519.0, + "step": 10373 + }, + { + "epoch": 1.319679430097952, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.123064041137695, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8813073635101318, + "num_tokens": 395965692.0, + "step": 10374 + }, + { + "epoch": 1.3198066403765425, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.01534652709961, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8675282597541809, + "num_tokens": 396002316.0, + "step": 10375 + }, + { + "epoch": 1.3199338506551328, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.944467544555664, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8726850748062134, + "num_tokens": 396043392.0, + "step": 10376 + }, + { + "epoch": 1.3200610609337233, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.959136962890625, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8633066415786743, + "num_tokens": 396078603.0, + "step": 10377 + }, + { + "epoch": 1.3201882712123139, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.858386993408203, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8572856187820435, + "num_tokens": 396121132.0, + "step": 10378 + }, + { + "epoch": 1.3203154814909044, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2237606048584, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8751012086868286, + "num_tokens": 396155730.0, + "step": 10379 + }, + { + "epoch": 1.320442691769495, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.099937438964844, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8522396087646484, + "num_tokens": 396190976.0, + "step": 10380 + }, + { + "epoch": 1.3205699020480854, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.13750648498535, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8653382062911987, + "num_tokens": 396229571.0, + "step": 10381 + }, + { + "epoch": 1.320697112326676, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.14092254638672, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8654049634933472, + "num_tokens": 396265586.0, + "step": 10382 + }, + { + "epoch": 1.3208243226052665, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.064634323120117, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8708496689796448, + "num_tokens": 396303810.0, + "step": 10383 + }, + { + "epoch": 1.320951532883857, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.317028045654297, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8859114646911621, + "num_tokens": 396345671.0, + "step": 10384 + }, + { + "epoch": 1.3210787431624476, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.984249114990234, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.883762538433075, + "num_tokens": 396388062.0, + "step": 10385 + }, + { + "epoch": 1.321205953441038, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.874284744262695, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8793079853057861, + "num_tokens": 396425189.0, + "step": 10386 + }, + { + "epoch": 1.3213331637196286, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2642879486084, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8572453260421753, + "num_tokens": 396461861.0, + "step": 10387 + }, + { + "epoch": 1.3214603739982191, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15554428100586, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8777225017547607, + "num_tokens": 396506190.0, + "step": 10388 + }, + { + "epoch": 1.3215875842768097, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.983781814575195, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8682032823562622, + "num_tokens": 396538900.0, + "step": 10389 + }, + { + "epoch": 1.3217147945554002, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04184913635254, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8756354451179504, + "num_tokens": 396577029.0, + "step": 10390 + }, + { + "epoch": 1.3218420048339907, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.991907119750977, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8509442210197449, + "num_tokens": 396616917.0, + "step": 10391 + }, + { + "epoch": 1.321969215112581, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04400062561035, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.875746488571167, + "num_tokens": 396651726.0, + "step": 10392 + }, + { + "epoch": 1.3220964253911716, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18790054321289, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8549440503120422, + "num_tokens": 396693796.0, + "step": 10393 + }, + { + "epoch": 1.322223635669762, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.24987030029297, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8563854098320007, + "num_tokens": 396731947.0, + "step": 10394 + }, + { + "epoch": 1.3223508459483526, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.925636291503906, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8701892495155334, + "num_tokens": 396767062.0, + "step": 10395 + }, + { + "epoch": 1.3224780562269431, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.257932662963867, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8694902658462524, + "num_tokens": 396803918.0, + "step": 10396 + }, + { + "epoch": 1.3226052665055337, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.067598342895508, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8398500084877014, + "num_tokens": 396850314.0, + "step": 10397 + }, + { + "epoch": 1.3227324767841242, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.94316864013672, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8598154783248901, + "num_tokens": 396887920.0, + "step": 10398 + }, + { + "epoch": 1.3228596870627147, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44632911682129, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8549805879592896, + "num_tokens": 396923369.0, + "step": 10399 + }, + { + "epoch": 1.3229868973413053, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.189210891723633, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8646077513694763, + "num_tokens": 396962338.0, + "step": 10400 + }, + { + "epoch": 1.3231141076198956, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.061744689941406, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8762907981872559, + "num_tokens": 396998275.0, + "step": 10401 + }, + { + "epoch": 1.323241317898486, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.080608367919922, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8643077611923218, + "num_tokens": 397040034.0, + "step": 10402 + }, + { + "epoch": 1.3233685281770766, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.187204360961914, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8847159147262573, + "num_tokens": 397080549.0, + "step": 10403 + }, + { + "epoch": 1.3234957384556671, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.105546951293945, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8790186643600464, + "num_tokens": 397120803.0, + "step": 10404 + }, + { + "epoch": 1.3236229487342577, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.849098205566406, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8713479042053223, + "num_tokens": 397160463.0, + "step": 10405 + }, + { + "epoch": 1.3237501590128482, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04915428161621, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.859958827495575, + "num_tokens": 397205425.0, + "step": 10406 + }, + { + "epoch": 1.3238773692914387, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.159381866455078, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.861615002155304, + "num_tokens": 397238907.0, + "step": 10407 + }, + { + "epoch": 1.3240045795700293, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.030742645263672, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.867431640625, + "num_tokens": 397281639.0, + "step": 10408 + }, + { + "epoch": 1.3241317898486198, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.024023056030273, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8557421565055847, + "num_tokens": 397323350.0, + "step": 10409 + }, + { + "epoch": 1.3242590001272103, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.127290725708008, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8600309491157532, + "num_tokens": 397364438.0, + "step": 10410 + }, + { + "epoch": 1.3243862104058008, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.002321243286133, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8547617793083191, + "num_tokens": 397406881.0, + "step": 10411 + }, + { + "epoch": 1.3245134206843914, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.369895935058594, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.867521345615387, + "num_tokens": 397451190.0, + "step": 10412 + }, + { + "epoch": 1.324640630962982, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.0058536529541, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8676371574401855, + "num_tokens": 397491063.0, + "step": 10413 + }, + { + "epoch": 1.3247678412415724, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.127193450927734, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8672775626182556, + "num_tokens": 397529961.0, + "step": 10414 + }, + { + "epoch": 1.324895051520163, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.218292236328125, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8768432140350342, + "num_tokens": 397565616.0, + "step": 10415 + }, + { + "epoch": 1.3250222617987533, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.149459838867188, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8732096552848816, + "num_tokens": 397602223.0, + "step": 10416 + }, + { + "epoch": 1.3251494720773438, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16023063659668, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8444777131080627, + "num_tokens": 397638910.0, + "step": 10417 + }, + { + "epoch": 1.3252766823559343, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03606605529785, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8595300912857056, + "num_tokens": 397681200.0, + "step": 10418 + }, + { + "epoch": 1.3254038926345248, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.12900161743164, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8686965703964233, + "num_tokens": 397724832.0, + "step": 10419 + }, + { + "epoch": 1.3255311029131154, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.014312744140625, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8590766191482544, + "num_tokens": 397765537.0, + "step": 10420 + }, + { + "epoch": 1.325658313191706, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.284439086914062, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8572371006011963, + "num_tokens": 397800945.0, + "step": 10421 + }, + { + "epoch": 1.3257855234702964, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.999120712280273, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8726038932800293, + "num_tokens": 397840786.0, + "step": 10422 + }, + { + "epoch": 1.325912733748887, + "ewc_loss": 0.03125, + "ewc_loss_parallel": 3.123283386230469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.954240798950195, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.870126485824585, + "num_tokens": 397875607.0, + "step": 10423 + }, + { + "epoch": 1.3260399440274775, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.056734085083008, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8823987245559692, + "num_tokens": 397909524.0, + "step": 10424 + }, + { + "epoch": 1.3261671543060678, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21923828125, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8503602147102356, + "num_tokens": 397947862.0, + "step": 10425 + }, + { + "epoch": 1.3262943645846583, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.040395736694336, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8808730244636536, + "num_tokens": 397980958.0, + "step": 10426 + }, + { + "epoch": 1.3264215748632489, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.337474822998047, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8636409044265747, + "num_tokens": 398021231.0, + "step": 10427 + }, + { + "epoch": 1.3265487851418394, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.968130111694336, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8639456629753113, + "num_tokens": 398059202.0, + "step": 10428 + }, + { + "epoch": 1.32667599542043, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.166738510131836, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8876678943634033, + "num_tokens": 398097121.0, + "step": 10429 + }, + { + "epoch": 1.3268032056990204, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.14752197265625, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8660391569137573, + "num_tokens": 398130413.0, + "step": 10430 + }, + { + "epoch": 1.326930415977611, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.186912536621094, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.865323543548584, + "num_tokens": 398172535.0, + "step": 10431 + }, + { + "epoch": 1.3270576262562015, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.963916778564453, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8672242164611816, + "num_tokens": 398206681.0, + "step": 10432 + }, + { + "epoch": 1.327184836534792, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.01801109313965, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8710482120513916, + "num_tokens": 398245206.0, + "step": 10433 + }, + { + "epoch": 1.3273120468133826, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.135330200195312, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8630114793777466, + "num_tokens": 398278902.0, + "step": 10434 + }, + { + "epoch": 1.327439257091973, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.129255294799805, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8684437274932861, + "num_tokens": 398315833.0, + "step": 10435 + }, + { + "epoch": 1.3275664673705636, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.043062210083008, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8486299514770508, + "num_tokens": 398356496.0, + "step": 10436 + }, + { + "epoch": 1.3276936776491541, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.031742095947266, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8730273246765137, + "num_tokens": 398393661.0, + "step": 10437 + }, + { + "epoch": 1.3278208879277447, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.158367156982422, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8839334845542908, + "num_tokens": 398430463.0, + "step": 10438 + }, + { + "epoch": 1.3279480982063352, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.230121612548828, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8653072714805603, + "num_tokens": 398470382.0, + "step": 10439 + }, + { + "epoch": 1.3280753084849257, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.294153213500977, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8494512438774109, + "num_tokens": 398504793.0, + "step": 10440 + }, + { + "epoch": 1.328202518763516, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.87278175354004, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8571747541427612, + "num_tokens": 398541038.0, + "step": 10441 + }, + { + "epoch": 1.3283297290421066, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21156883239746, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8498361110687256, + "num_tokens": 398578047.0, + "step": 10442 + }, + { + "epoch": 1.328456939320697, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09518051147461, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8549445867538452, + "num_tokens": 398618946.0, + "step": 10443 + }, + { + "epoch": 1.3285841495992876, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20234489440918, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8716983199119568, + "num_tokens": 398656895.0, + "step": 10444 + }, + { + "epoch": 1.3287113598778781, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.049291610717773, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8663425445556641, + "num_tokens": 398696521.0, + "step": 10445 + }, + { + "epoch": 1.3288385701564687, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.04400062561035, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8671145439147949, + "num_tokens": 398735275.0, + "step": 10446 + }, + { + "epoch": 1.3289657804350592, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.17165184020996, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8521866202354431, + "num_tokens": 398774753.0, + "step": 10447 + }, + { + "epoch": 1.3290929907136497, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.002763748168945, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8603073358535767, + "num_tokens": 398810893.0, + "step": 10448 + }, + { + "epoch": 1.3292202009922403, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.004980087280273, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8605533838272095, + "num_tokens": 398849741.0, + "step": 10449 + }, + { + "epoch": 1.3293474112708306, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.11284637451172, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8602962493896484, + "num_tokens": 398881186.0, + "step": 10450 + }, + { + "epoch": 1.329474621549421, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.248920440673828, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8834351301193237, + "num_tokens": 398918677.0, + "step": 10451 + }, + { + "epoch": 1.3296018318280116, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.139968872070312, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8637427091598511, + "num_tokens": 398953435.0, + "step": 10452 + }, + { + "epoch": 1.3297290421066021, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.248552322387695, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8732448220252991, + "num_tokens": 398991799.0, + "step": 10453 + }, + { + "epoch": 1.3298562523851927, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.992847442626953, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8710840940475464, + "num_tokens": 399023211.0, + "step": 10454 + }, + { + "epoch": 1.3299834626637832, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.27286148071289, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8630476593971252, + "num_tokens": 399061431.0, + "step": 10455 + }, + { + "epoch": 1.3301106729423737, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.137760162353516, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8682447671890259, + "num_tokens": 399097384.0, + "step": 10456 + }, + { + "epoch": 1.3302378832209643, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.17349624633789, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8686567544937134, + "num_tokens": 399133935.0, + "step": 10457 + }, + { + "epoch": 1.3303650934995548, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.117889404296875, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8723105192184448, + "num_tokens": 399165709.0, + "step": 10458 + }, + { + "epoch": 1.3304923037781453, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.991451263427734, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8645867705345154, + "num_tokens": 399203548.0, + "step": 10459 + }, + { + "epoch": 1.3306195140567358, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.22603416442871, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8650634288787842, + "num_tokens": 399244341.0, + "step": 10460 + }, + { + "epoch": 1.3307467243353264, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.043407440185547, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8706610202789307, + "num_tokens": 399282284.0, + "step": 10461 + }, + { + "epoch": 1.330873934613917, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.290191650390625, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8789876699447632, + "num_tokens": 399320523.0, + "step": 10462 + }, + { + "epoch": 1.3310011448925074, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28813934326172, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.866168737411499, + "num_tokens": 399357489.0, + "step": 10463 + }, + { + "epoch": 1.331128355171098, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23525619506836, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.854614794254303, + "num_tokens": 399393561.0, + "step": 10464 + }, + { + "epoch": 1.3312555654496883, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20183563232422, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8713428974151611, + "num_tokens": 399424514.0, + "step": 10465 + }, + { + "epoch": 1.3313827757282788, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.14006805419922, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8724241256713867, + "num_tokens": 399461372.0, + "step": 10466 + }, + { + "epoch": 1.3315099860068693, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.234891891479492, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8805087804794312, + "num_tokens": 399502503.0, + "step": 10467 + }, + { + "epoch": 1.3316371962854598, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.08347511291504, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8604835271835327, + "num_tokens": 399537836.0, + "step": 10468 + }, + { + "epoch": 1.3317644065640504, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.179847717285156, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8625739216804504, + "num_tokens": 399575189.0, + "step": 10469 + }, + { + "epoch": 1.331891616842641, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05921745300293, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8479862809181213, + "num_tokens": 399614406.0, + "step": 10470 + }, + { + "epoch": 1.3320188271212314, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.209407806396484, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8717161417007446, + "num_tokens": 399652847.0, + "step": 10471 + }, + { + "epoch": 1.332146037399822, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.11744499206543, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8566029667854309, + "num_tokens": 399691628.0, + "step": 10472 + }, + { + "epoch": 1.3322732476784125, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.974159240722656, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8769741654396057, + "num_tokens": 399732875.0, + "step": 10473 + }, + { + "epoch": 1.3324004579570028, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.222366333007812, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.877741813659668, + "num_tokens": 399769537.0, + "step": 10474 + }, + { + "epoch": 1.3325276682355933, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15203285217285, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.851842999458313, + "num_tokens": 399807757.0, + "step": 10475 + }, + { + "epoch": 1.3326548785141838, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.12040138244629, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8729113340377808, + "num_tokens": 399837351.0, + "step": 10476 + }, + { + "epoch": 1.3327820887927744, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.245403289794922, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8848254084587097, + "num_tokens": 399873678.0, + "step": 10477 + }, + { + "epoch": 1.332909299071365, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.1688232421875, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.868544340133667, + "num_tokens": 399916206.0, + "step": 10478 + }, + { + "epoch": 1.3330365093499554, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.268707275390625, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8786677122116089, + "num_tokens": 399954144.0, + "step": 10479 + }, + { + "epoch": 1.333163719628546, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.296892166137695, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8749945759773254, + "num_tokens": 399995865.0, + "step": 10480 + }, + { + "epoch": 1.3332909299071365, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.892372131347656, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.868056058883667, + "num_tokens": 400033234.0, + "step": 10481 + }, + { + "epoch": 1.333418140185727, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.317222595214844, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8757823705673218, + "num_tokens": 400066842.0, + "step": 10482 + }, + { + "epoch": 1.3335453504643175, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.334976196289062, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8782089948654175, + "num_tokens": 400099380.0, + "step": 10483 + }, + { + "epoch": 1.333672560742908, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.154346466064453, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8785747289657593, + "num_tokens": 400137419.0, + "step": 10484 + }, + { + "epoch": 1.3337997710214986, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.077590942382812, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.868228554725647, + "num_tokens": 400171086.0, + "step": 10485 + }, + { + "epoch": 1.3339269813000891, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.187498092651367, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8645068407058716, + "num_tokens": 400206763.0, + "step": 10486 + }, + { + "epoch": 1.3340541915786797, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.10833168029785, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8513337969779968, + "num_tokens": 400242708.0, + "step": 10487 + }, + { + "epoch": 1.3341814018572702, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.131269454956055, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8480011224746704, + "num_tokens": 400277417.0, + "step": 10488 + }, + { + "epoch": 1.3343086121358605, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.055692672729492, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8662487864494324, + "num_tokens": 400315811.0, + "step": 10489 + }, + { + "epoch": 1.334435822414451, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.076717376708984, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8468577861785889, + "num_tokens": 400356321.0, + "step": 10490 + }, + { + "epoch": 1.3345630326930416, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.29250717163086, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8668042421340942, + "num_tokens": 400392382.0, + "step": 10491 + }, + { + "epoch": 1.334690242971632, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.089929580688477, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8736189603805542, + "num_tokens": 400434947.0, + "step": 10492 + }, + { + "epoch": 1.3348174532502226, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.223003387451172, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.84822016954422, + "num_tokens": 400472742.0, + "step": 10493 + }, + { + "epoch": 1.3349446635288131, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16053009033203, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8810587525367737, + "num_tokens": 400510029.0, + "step": 10494 + }, + { + "epoch": 1.3350718738074037, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.014644622802734, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.863332986831665, + "num_tokens": 400547383.0, + "step": 10495 + }, + { + "epoch": 1.3351990840859942, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.19898223876953, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8783295750617981, + "num_tokens": 400585242.0, + "step": 10496 + }, + { + "epoch": 1.3353262943645847, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.068161010742188, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.877792239189148, + "num_tokens": 400623117.0, + "step": 10497 + }, + { + "epoch": 1.3354535046431752, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.440895080566406, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8558381795883179, + "num_tokens": 400660170.0, + "step": 10498 + }, + { + "epoch": 1.3355807149217656, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.039011001586914, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.87305748462677, + "num_tokens": 400700467.0, + "step": 10499 + }, + { + "epoch": 1.335707925200356, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.84308624267578, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8612426519393921, + "num_tokens": 400740269.0, + "step": 10500 + }, + { + "epoch": 1.3358351354789466, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.211746215820312, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8775724768638611, + "num_tokens": 400776465.0, + "step": 10501 + }, + { + "epoch": 1.3359623457575371, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.95150375366211, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8639971017837524, + "num_tokens": 400811210.0, + "step": 10502 + }, + { + "epoch": 1.3360895560361277, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.066810607910156, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8595361113548279, + "num_tokens": 400848820.0, + "step": 10503 + }, + { + "epoch": 1.3362167663147182, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.932668685913086, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8680866956710815, + "num_tokens": 400888774.0, + "step": 10504 + }, + { + "epoch": 1.3363439765933087, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.274141311645508, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8687194585800171, + "num_tokens": 400935067.0, + "step": 10505 + }, + { + "epoch": 1.3364711868718993, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.343177795410156, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8762718439102173, + "num_tokens": 400974122.0, + "step": 10506 + }, + { + "epoch": 1.3365983971504898, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.057985305786133, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8768002986907959, + "num_tokens": 401008911.0, + "step": 10507 + }, + { + "epoch": 1.3367256074290803, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.546615600585938, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8725313544273376, + "num_tokens": 401039453.0, + "step": 10508 + }, + { + "epoch": 1.3368528177076708, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.125619888305664, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8764064311981201, + "num_tokens": 401075854.0, + "step": 10509 + }, + { + "epoch": 1.3369800279862614, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4801082611084, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8654255270957947, + "num_tokens": 401114922.0, + "step": 10510 + }, + { + "epoch": 1.337107238264852, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34359359741211, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8757960796356201, + "num_tokens": 401155833.0, + "step": 10511 + }, + { + "epoch": 1.3372344485434424, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.11504554748535, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8738787174224854, + "num_tokens": 401196971.0, + "step": 10512 + }, + { + "epoch": 1.337361658822033, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.214557647705078, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8363193869590759, + "num_tokens": 401237312.0, + "step": 10513 + }, + { + "epoch": 1.3374888691006233, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.969844818115234, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8643230199813843, + "num_tokens": 401274630.0, + "step": 10514 + }, + { + "epoch": 1.3376160793792138, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.098840713500977, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8727015256881714, + "num_tokens": 401311780.0, + "step": 10515 + }, + { + "epoch": 1.3377432896578043, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.054080963134766, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8747075200080872, + "num_tokens": 401352614.0, + "step": 10516 + }, + { + "epoch": 1.3378704999363948, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.093311309814453, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8703163862228394, + "num_tokens": 401394495.0, + "step": 10517 + }, + { + "epoch": 1.3379977102149854, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.057886123657227, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8551692962646484, + "num_tokens": 401427758.0, + "step": 10518 + }, + { + "epoch": 1.338124920493576, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18726348876953, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8528220653533936, + "num_tokens": 401473868.0, + "step": 10519 + }, + { + "epoch": 1.3382521307721664, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.138731002807617, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8497974872589111, + "num_tokens": 401510459.0, + "step": 10520 + }, + { + "epoch": 1.338379341050757, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.964284896850586, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8378261923789978, + "num_tokens": 401545283.0, + "step": 10521 + }, + { + "epoch": 1.3385065513293475, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.149282455444336, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8684680461883545, + "num_tokens": 401577166.0, + "step": 10522 + }, + { + "epoch": 1.3386337616079378, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21887969970703, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8799424171447754, + "num_tokens": 401619075.0, + "step": 10523 + }, + { + "epoch": 1.3387609718865283, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.100505828857422, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8609682321548462, + "num_tokens": 401661286.0, + "step": 10524 + }, + { + "epoch": 1.3388881821651188, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.08742904663086, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8575366139411926, + "num_tokens": 401699672.0, + "step": 10525 + }, + { + "epoch": 1.3390153924437094, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.232563018798828, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8606557250022888, + "num_tokens": 401732676.0, + "step": 10526 + }, + { + "epoch": 1.3391426027223, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02970314025879, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8644506931304932, + "num_tokens": 401771593.0, + "step": 10527 + }, + { + "epoch": 1.3392698130008904, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.1046085357666, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8535679578781128, + "num_tokens": 401809605.0, + "step": 10528 + }, + { + "epoch": 1.339397023279481, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.316997528076172, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8572702407836914, + "num_tokens": 401845349.0, + "step": 10529 + }, + { + "epoch": 1.3395242335580715, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03325843811035, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8891595602035522, + "num_tokens": 401883186.0, + "step": 10530 + }, + { + "epoch": 1.339651443836662, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16437530517578, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8511689901351929, + "num_tokens": 401921149.0, + "step": 10531 + }, + { + "epoch": 1.3397786541152525, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.984214782714844, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.860905647277832, + "num_tokens": 401961813.0, + "step": 10532 + }, + { + "epoch": 1.339905864393843, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02667236328125, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8493053317070007, + "num_tokens": 402001212.0, + "step": 10533 + }, + { + "epoch": 1.3400330746724336, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.921096801757812, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8603490591049194, + "num_tokens": 402044648.0, + "step": 10534 + }, + { + "epoch": 1.3401602849510241, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.101581573486328, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8695918917655945, + "num_tokens": 402087391.0, + "step": 10535 + }, + { + "epoch": 1.3402874952296147, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.944414138793945, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8861681222915649, + "num_tokens": 402119598.0, + "step": 10536 + }, + { + "epoch": 1.3404147055082052, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20651626586914, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8423073291778564, + "num_tokens": 402153628.0, + "step": 10537 + }, + { + "epoch": 1.3405419157867955, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.158729553222656, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8831138610839844, + "num_tokens": 402190876.0, + "step": 10538 + }, + { + "epoch": 1.340669126065386, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09337043762207, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8752155303955078, + "num_tokens": 402227168.0, + "step": 10539 + }, + { + "epoch": 1.3407963363439765, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.197723388671875, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8756153583526611, + "num_tokens": 402262731.0, + "step": 10540 + }, + { + "epoch": 1.340923546622567, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03489112854004, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8674650192260742, + "num_tokens": 402299992.0, + "step": 10541 + }, + { + "epoch": 1.3410507569011576, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.026636123657227, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8585225939750671, + "num_tokens": 402339129.0, + "step": 10542 + }, + { + "epoch": 1.3411779671797481, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.134624481201172, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8502705097198486, + "num_tokens": 402375380.0, + "step": 10543 + }, + { + "epoch": 1.3413051774583387, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.173643112182617, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8548915386199951, + "num_tokens": 402412159.0, + "step": 10544 + }, + { + "epoch": 1.3414323877369292, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16845703125, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8624098896980286, + "num_tokens": 402446676.0, + "step": 10545 + }, + { + "epoch": 1.3415595980155197, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.974491119384766, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8674490451812744, + "num_tokens": 402490741.0, + "step": 10546 + }, + { + "epoch": 1.3416868082941102, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.106103897094727, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8752411603927612, + "num_tokens": 402527398.0, + "step": 10547 + }, + { + "epoch": 1.3418140185727006, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.13091468811035, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8662306666374207, + "num_tokens": 402565914.0, + "step": 10548 + }, + { + "epoch": 1.341941228851291, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.023883819580078, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8701150417327881, + "num_tokens": 402600671.0, + "step": 10549 + }, + { + "epoch": 1.3420684391298816, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.13161849975586, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.879427433013916, + "num_tokens": 402637381.0, + "step": 10550 + }, + { + "epoch": 1.3421956494084721, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.29356575012207, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8703688979148865, + "num_tokens": 402671190.0, + "step": 10551 + }, + { + "epoch": 1.3423228596870627, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.927108764648438, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8669226169586182, + "num_tokens": 402721518.0, + "step": 10552 + }, + { + "epoch": 1.3424500699656532, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.108257293701172, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8671533465385437, + "num_tokens": 402758949.0, + "step": 10553 + }, + { + "epoch": 1.3425772802442437, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23346519470215, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8612707853317261, + "num_tokens": 402794342.0, + "step": 10554 + }, + { + "epoch": 1.3427044905228342, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.114532470703125, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8785730004310608, + "num_tokens": 402831031.0, + "step": 10555 + }, + { + "epoch": 1.3428317008014248, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05704689025879, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8771235942840576, + "num_tokens": 402871486.0, + "step": 10556 + }, + { + "epoch": 1.3429589110800153, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.163145065307617, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8790197372436523, + "num_tokens": 402916024.0, + "step": 10557 + }, + { + "epoch": 1.3430861213586058, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09024429321289, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8488984107971191, + "num_tokens": 402952116.0, + "step": 10558 + }, + { + "epoch": 1.3432133316371964, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.02597427368164, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8655548691749573, + "num_tokens": 402998064.0, + "step": 10559 + }, + { + "epoch": 1.343340541915787, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05550193786621, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8710672855377197, + "num_tokens": 403041763.0, + "step": 10560 + }, + { + "epoch": 1.3434677521943774, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.07784080505371, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8783336877822876, + "num_tokens": 403080956.0, + "step": 10561 + }, + { + "epoch": 1.343594962472968, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.123254776000977, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8765193223953247, + "num_tokens": 403117200.0, + "step": 10562 + }, + { + "epoch": 1.3437221727515583, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15258026123047, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8662590384483337, + "num_tokens": 403154112.0, + "step": 10563 + }, + { + "epoch": 1.3438493830301488, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.93972396850586, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8642948865890503, + "num_tokens": 403188265.0, + "step": 10564 + }, + { + "epoch": 1.3439765933087393, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05879020690918, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8452157378196716, + "num_tokens": 403231236.0, + "step": 10565 + }, + { + "epoch": 1.3441038035873298, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.056926727294922, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8689440488815308, + "num_tokens": 403268642.0, + "step": 10566 + }, + { + "epoch": 1.3442310138659204, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.120738983154297, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8749706745147705, + "num_tokens": 403307679.0, + "step": 10567 + }, + { + "epoch": 1.344358224144511, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.121631622314453, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8736876249313354, + "num_tokens": 403346395.0, + "step": 10568 + }, + { + "epoch": 1.3444854344231014, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.1641845703125, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8757602572441101, + "num_tokens": 403383001.0, + "step": 10569 + }, + { + "epoch": 1.344612644701692, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.158899307250977, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8592164516448975, + "num_tokens": 403426090.0, + "step": 10570 + }, + { + "epoch": 1.3447398549802825, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.07008171081543, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8598505258560181, + "num_tokens": 403470231.0, + "step": 10571 + }, + { + "epoch": 1.3448670652588728, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.170942306518555, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.872600793838501, + "num_tokens": 403508677.0, + "step": 10572 + }, + { + "epoch": 1.3449942755374633, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.32054901123047, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8490099310874939, + "num_tokens": 403545175.0, + "step": 10573 + }, + { + "epoch": 1.3451214858160538, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.087547302246094, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8804944753646851, + "num_tokens": 403583321.0, + "step": 10574 + }, + { + "epoch": 1.3452486960946444, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2208194732666, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8609342575073242, + "num_tokens": 403618547.0, + "step": 10575 + }, + { + "epoch": 1.345375906373235, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.017202377319336, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8702104687690735, + "num_tokens": 403657552.0, + "step": 10576 + }, + { + "epoch": 1.3455031166518254, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25667953491211, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8698459267616272, + "num_tokens": 403697021.0, + "step": 10577 + }, + { + "epoch": 1.345630326930416, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.110435485839844, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8567942976951599, + "num_tokens": 403736480.0, + "step": 10578 + }, + { + "epoch": 1.3457575372090065, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.028196334838867, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8738253712654114, + "num_tokens": 403772056.0, + "step": 10579 + }, + { + "epoch": 1.345884747487597, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.200565338134766, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8610410690307617, + "num_tokens": 403813646.0, + "step": 10580 + }, + { + "epoch": 1.3460119577661875, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15986442565918, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8702625632286072, + "num_tokens": 403853904.0, + "step": 10581 + }, + { + "epoch": 1.346139168044778, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.010562896728516, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8518386483192444, + "num_tokens": 403898528.0, + "step": 10582 + }, + { + "epoch": 1.3462663783233686, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16999053955078, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8525034189224243, + "num_tokens": 403929518.0, + "step": 10583 + }, + { + "epoch": 1.3463935886019591, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.183168411254883, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8683415651321411, + "num_tokens": 403970323.0, + "step": 10584 + }, + { + "epoch": 1.3465207988805497, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.994998931884766, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8702214956283569, + "num_tokens": 404012499.0, + "step": 10585 + }, + { + "epoch": 1.3466480091591402, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.174379348754883, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8623905777931213, + "num_tokens": 404055370.0, + "step": 10586 + }, + { + "epoch": 1.3467752194377305, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.058351516723633, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8670903444290161, + "num_tokens": 404092498.0, + "step": 10587 + }, + { + "epoch": 1.346902429716321, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09514808654785, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8734655380249023, + "num_tokens": 404132466.0, + "step": 10588 + }, + { + "epoch": 1.3470296399949115, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15831756591797, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.866665244102478, + "num_tokens": 404171913.0, + "step": 10589 + }, + { + "epoch": 1.347156850273502, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2443790435791, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8804563283920288, + "num_tokens": 404205817.0, + "step": 10590 + }, + { + "epoch": 1.3472840605520926, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.100162506103516, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8705704212188721, + "num_tokens": 404243922.0, + "step": 10591 + }, + { + "epoch": 1.3474112708306831, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.149036407470703, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8703241348266602, + "num_tokens": 404283392.0, + "step": 10592 + }, + { + "epoch": 1.3475384811092737, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.267004013061523, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8414605855941772, + "num_tokens": 404323044.0, + "step": 10593 + }, + { + "epoch": 1.3476656913878642, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96583366394043, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8733072876930237, + "num_tokens": 404359761.0, + "step": 10594 + }, + { + "epoch": 1.3477929016664547, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.292646408081055, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8545269966125488, + "num_tokens": 404395354.0, + "step": 10595 + }, + { + "epoch": 1.3479201119450452, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.093217849731445, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8500397801399231, + "num_tokens": 404432052.0, + "step": 10596 + }, + { + "epoch": 1.3480473222236355, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.268774032592773, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8594180941581726, + "num_tokens": 404472689.0, + "step": 10597 + }, + { + "epoch": 1.348174532502226, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.9296817779541, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8527047634124756, + "num_tokens": 404515916.0, + "step": 10598 + }, + { + "epoch": 1.3483017427808166, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.29547119140625, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8663395047187805, + "num_tokens": 404557399.0, + "step": 10599 + }, + { + "epoch": 1.3484289530594071, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.095399856567383, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8631483316421509, + "num_tokens": 404592573.0, + "step": 10600 + }, + { + "epoch": 1.3485561633379977, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.10936737060547, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8638826012611389, + "num_tokens": 404635266.0, + "step": 10601 + }, + { + "epoch": 1.3486833736165882, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.274333953857422, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8764328956604004, + "num_tokens": 404673004.0, + "step": 10602 + }, + { + "epoch": 1.3488105838951787, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15781593322754, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8515785336494446, + "num_tokens": 404705153.0, + "step": 10603 + }, + { + "epoch": 1.3489377941737692, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.266828536987305, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8578298091888428, + "num_tokens": 404737068.0, + "step": 10604 + }, + { + "epoch": 1.3490650044523598, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.1673641204834, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8455162048339844, + "num_tokens": 404773680.0, + "step": 10605 + }, + { + "epoch": 1.3491922147309503, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16642189025879, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8727985620498657, + "num_tokens": 404809937.0, + "step": 10606 + }, + { + "epoch": 1.3493194250095408, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.168209075927734, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8574230670928955, + "num_tokens": 404847146.0, + "step": 10607 + }, + { + "epoch": 1.3494466352881314, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.380847930908203, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8676368594169617, + "num_tokens": 404882952.0, + "step": 10608 + }, + { + "epoch": 1.3495738455667219, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.08873176574707, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8746058940887451, + "num_tokens": 404925347.0, + "step": 10609 + }, + { + "epoch": 1.3497010558453124, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.121082305908203, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8519003987312317, + "num_tokens": 404963411.0, + "step": 10610 + }, + { + "epoch": 1.349828266123903, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.22720718383789, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8643041849136353, + "num_tokens": 404997522.0, + "step": 10611 + }, + { + "epoch": 1.3499554764024932, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.115283966064453, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.864625096321106, + "num_tokens": 405034049.0, + "step": 10612 + }, + { + "epoch": 1.3500826866810838, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.239734649658203, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8632779717445374, + "num_tokens": 405074985.0, + "step": 10613 + }, + { + "epoch": 1.3502098969596743, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.316356658935547, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8815052509307861, + "num_tokens": 405116133.0, + "step": 10614 + }, + { + "epoch": 1.3503371072382648, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.048168182373047, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8648714423179626, + "num_tokens": 405155572.0, + "step": 10615 + }, + { + "epoch": 1.3504643175168554, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.143598556518555, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8686040639877319, + "num_tokens": 405194700.0, + "step": 10616 + }, + { + "epoch": 1.350591527795446, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36455535888672, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8622328639030457, + "num_tokens": 405233582.0, + "step": 10617 + }, + { + "epoch": 1.3507187380740364, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.27714729309082, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8781693577766418, + "num_tokens": 405274987.0, + "step": 10618 + }, + { + "epoch": 1.350845948352627, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.291189193725586, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8647130131721497, + "num_tokens": 405308151.0, + "step": 10619 + }, + { + "epoch": 1.3509731586312175, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23111915588379, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8667452335357666, + "num_tokens": 405343939.0, + "step": 10620 + }, + { + "epoch": 1.3511003689098078, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09491729736328, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8656295537948608, + "num_tokens": 405384211.0, + "step": 10621 + }, + { + "epoch": 1.3512275791883983, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.360471725463867, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8732462525367737, + "num_tokens": 405419971.0, + "step": 10622 + }, + { + "epoch": 1.3513547894669888, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18874168395996, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8602396249771118, + "num_tokens": 405451322.0, + "step": 10623 + }, + { + "epoch": 1.3514819997455794, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.145732879638672, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8577665090560913, + "num_tokens": 405496101.0, + "step": 10624 + }, + { + "epoch": 1.35160921002417, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.159603118896484, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8512394428253174, + "num_tokens": 405530034.0, + "step": 10625 + }, + { + "epoch": 1.3517364203027604, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23520851135254, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.878626823425293, + "num_tokens": 405570893.0, + "step": 10626 + }, + { + "epoch": 1.351863630581351, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18805503845215, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8534849882125854, + "num_tokens": 405610120.0, + "step": 10627 + }, + { + "epoch": 1.3519908408599415, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15309715270996, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8653080463409424, + "num_tokens": 405651878.0, + "step": 10628 + }, + { + "epoch": 1.352118051138532, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35274314880371, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8713653683662415, + "num_tokens": 405694578.0, + "step": 10629 + }, + { + "epoch": 1.3522452614171225, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.19363784790039, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8724685907363892, + "num_tokens": 405734128.0, + "step": 10630 + }, + { + "epoch": 1.352372471695713, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.458738327026367, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8721021413803101, + "num_tokens": 405776588.0, + "step": 10631 + }, + { + "epoch": 1.3524996819743036, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.211454391479492, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8626678586006165, + "num_tokens": 405810867.0, + "step": 10632 + }, + { + "epoch": 1.3526268922528941, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.39911460876465, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8832546472549438, + "num_tokens": 405848884.0, + "step": 10633 + }, + { + "epoch": 1.3527541025314846, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.249095916748047, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8856615424156189, + "num_tokens": 405888759.0, + "step": 10634 + }, + { + "epoch": 1.3528813128100752, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.409666061401367, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8797041773796082, + "num_tokens": 405925686.0, + "step": 10635 + }, + { + "epoch": 1.3530085230886655, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.273256301879883, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8787610530853271, + "num_tokens": 405960836.0, + "step": 10636 + }, + { + "epoch": 1.353135733367256, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.206336975097656, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8775498867034912, + "num_tokens": 406000314.0, + "step": 10637 + }, + { + "epoch": 1.3532629436458465, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.244203567504883, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8767794966697693, + "num_tokens": 406039468.0, + "step": 10638 + }, + { + "epoch": 1.353390153924437, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.142282485961914, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8901444673538208, + "num_tokens": 406076828.0, + "step": 10639 + }, + { + "epoch": 1.3535173642030276, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3402099609375, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.865364670753479, + "num_tokens": 406116504.0, + "step": 10640 + }, + { + "epoch": 1.3536445744816181, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.141502380371094, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8664770722389221, + "num_tokens": 406155184.0, + "step": 10641 + }, + { + "epoch": 1.3537717847602087, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.082569122314453, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8481720685958862, + "num_tokens": 406195411.0, + "step": 10642 + }, + { + "epoch": 1.3538989950387992, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.32651138305664, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8566107153892517, + "num_tokens": 406230159.0, + "step": 10643 + }, + { + "epoch": 1.3540262053173897, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.226985931396484, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.856083869934082, + "num_tokens": 406264541.0, + "step": 10644 + }, + { + "epoch": 1.3541534155959802, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18477439880371, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8722118735313416, + "num_tokens": 406301869.0, + "step": 10645 + }, + { + "epoch": 1.3542806258745705, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.285633087158203, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8689390420913696, + "num_tokens": 406338819.0, + "step": 10646 + }, + { + "epoch": 1.354407836153161, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.358762741088867, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8684078454971313, + "num_tokens": 406375914.0, + "step": 10647 + }, + { + "epoch": 1.3545350464317516, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.250398635864258, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8720477819442749, + "num_tokens": 406418543.0, + "step": 10648 + }, + { + "epoch": 1.3546622567103421, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.26838493347168, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.864931046962738, + "num_tokens": 406455731.0, + "step": 10649 + }, + { + "epoch": 1.3547894669889327, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.238994598388672, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8659913539886475, + "num_tokens": 406496266.0, + "step": 10650 + }, + { + "epoch": 1.3549166772675232, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21636199951172, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8623894453048706, + "num_tokens": 406541676.0, + "step": 10651 + }, + { + "epoch": 1.3550438875461137, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.066438674926758, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8630330562591553, + "num_tokens": 406578997.0, + "step": 10652 + }, + { + "epoch": 1.3551710978247042, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.086563110351562, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8714320063591003, + "num_tokens": 406619404.0, + "step": 10653 + }, + { + "epoch": 1.3552983081032948, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.112398147583008, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8831486701965332, + "num_tokens": 406656092.0, + "step": 10654 + }, + { + "epoch": 1.3554255183818853, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20693016052246, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8501967191696167, + "num_tokens": 406701600.0, + "step": 10655 + }, + { + "epoch": 1.3555527286604758, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.180931091308594, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8659730553627014, + "num_tokens": 406739649.0, + "step": 10656 + }, + { + "epoch": 1.3556799389390664, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.125205993652344, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8851327896118164, + "num_tokens": 406774962.0, + "step": 10657 + }, + { + "epoch": 1.3558071492176569, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.19342041015625, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8638408184051514, + "num_tokens": 406818047.0, + "step": 10658 + }, + { + "epoch": 1.3559343594962474, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.06540870666504, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8598712086677551, + "num_tokens": 406859464.0, + "step": 10659 + }, + { + "epoch": 1.356061569774838, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.188032150268555, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8654901385307312, + "num_tokens": 406896444.0, + "step": 10660 + }, + { + "epoch": 1.3561887800534282, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.99039077758789, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8616198301315308, + "num_tokens": 406934623.0, + "step": 10661 + }, + { + "epoch": 1.3563159903320188, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.33724021911621, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8599218130111694, + "num_tokens": 406974290.0, + "step": 10662 + }, + { + "epoch": 1.3564432006106093, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.194520950317383, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8856419324874878, + "num_tokens": 407011279.0, + "step": 10663 + }, + { + "epoch": 1.3565704108891998, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.068328857421875, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8561844825744629, + "num_tokens": 407054733.0, + "step": 10664 + }, + { + "epoch": 1.3566976211677904, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.971271514892578, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8680896162986755, + "num_tokens": 407098410.0, + "step": 10665 + }, + { + "epoch": 1.3568248314463809, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.224950790405273, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8600267767906189, + "num_tokens": 407134231.0, + "step": 10666 + }, + { + "epoch": 1.3569520417249714, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.147705078125, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8528677225112915, + "num_tokens": 407167231.0, + "step": 10667 + }, + { + "epoch": 1.357079252003562, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.029329299926758, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8692330121994019, + "num_tokens": 407206604.0, + "step": 10668 + }, + { + "epoch": 1.3572064622821525, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.368501663208008, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8689571619033813, + "num_tokens": 407248247.0, + "step": 10669 + }, + { + "epoch": 1.3573336725607428, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.17997169494629, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8765910863876343, + "num_tokens": 407282870.0, + "step": 10670 + }, + { + "epoch": 1.3574608828393333, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.208087921142578, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8729647397994995, + "num_tokens": 407318704.0, + "step": 10671 + }, + { + "epoch": 1.3575880931179238, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.283370971679688, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8689528703689575, + "num_tokens": 407360003.0, + "step": 10672 + }, + { + "epoch": 1.3577153033965144, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.214130401611328, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8613941669464111, + "num_tokens": 407399473.0, + "step": 10673 + }, + { + "epoch": 1.357842513675105, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.288734436035156, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8369737267494202, + "num_tokens": 407440576.0, + "step": 10674 + }, + { + "epoch": 1.3579697239536954, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28433609008789, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8632472157478333, + "num_tokens": 407481057.0, + "step": 10675 + }, + { + "epoch": 1.358096934232286, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.177980422973633, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8785605430603027, + "num_tokens": 407517104.0, + "step": 10676 + }, + { + "epoch": 1.3582241445108765, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.095869064331055, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8739802837371826, + "num_tokens": 407552604.0, + "step": 10677 + }, + { + "epoch": 1.358351354789467, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.22435760498047, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8804317712783813, + "num_tokens": 407588282.0, + "step": 10678 + }, + { + "epoch": 1.3584785650680575, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23472023010254, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8616684079170227, + "num_tokens": 407619837.0, + "step": 10679 + }, + { + "epoch": 1.358605775346648, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.239328384399414, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8635648488998413, + "num_tokens": 407656939.0, + "step": 10680 + }, + { + "epoch": 1.3587329856252386, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.160085678100586, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8564761877059937, + "num_tokens": 407702756.0, + "step": 10681 + }, + { + "epoch": 1.3588601959038291, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.330533981323242, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8756781816482544, + "num_tokens": 407736142.0, + "step": 10682 + }, + { + "epoch": 1.3589874061824196, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.146270751953125, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8635889887809753, + "num_tokens": 407773184.0, + "step": 10683 + }, + { + "epoch": 1.3591146164610102, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.303783416748047, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8516870737075806, + "num_tokens": 407806543.0, + "step": 10684 + }, + { + "epoch": 1.3592418267396005, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.197750091552734, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8660693168640137, + "num_tokens": 407841371.0, + "step": 10685 + }, + { + "epoch": 1.359369037018191, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05614471435547, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8710046410560608, + "num_tokens": 407879862.0, + "step": 10686 + }, + { + "epoch": 1.3594962472967815, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45687484741211, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8614481091499329, + "num_tokens": 407917947.0, + "step": 10687 + }, + { + "epoch": 1.359623457575372, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.22385025024414, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8742697238922119, + "num_tokens": 407953987.0, + "step": 10688 + }, + { + "epoch": 1.3597506678539626, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.000446319580078, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8510578274726868, + "num_tokens": 407993166.0, + "step": 10689 + }, + { + "epoch": 1.3598778781325531, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.223352432250977, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8749208450317383, + "num_tokens": 408037063.0, + "step": 10690 + }, + { + "epoch": 1.3600050884111436, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.262712478637695, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8696690797805786, + "num_tokens": 408078526.0, + "step": 10691 + }, + { + "epoch": 1.3601322986897342, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.24968719482422, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8744320869445801, + "num_tokens": 408116002.0, + "step": 10692 + }, + { + "epoch": 1.3602595089683247, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.226808547973633, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8633655905723572, + "num_tokens": 408155885.0, + "step": 10693 + }, + { + "epoch": 1.3603867192469152, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.193296432495117, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.872051477432251, + "num_tokens": 408190085.0, + "step": 10694 + }, + { + "epoch": 1.3605139295255055, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.262134552001953, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8689316511154175, + "num_tokens": 408227289.0, + "step": 10695 + }, + { + "epoch": 1.360641139804096, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15574073791504, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8744528293609619, + "num_tokens": 408271853.0, + "step": 10696 + }, + { + "epoch": 1.3607683500826866, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.342273712158203, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8799324035644531, + "num_tokens": 408311133.0, + "step": 10697 + }, + { + "epoch": 1.3608955603612771, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.143980026245117, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8693965077400208, + "num_tokens": 408342775.0, + "step": 10698 + }, + { + "epoch": 1.3610227706398677, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.254758834838867, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8790189027786255, + "num_tokens": 408379063.0, + "step": 10699 + }, + { + "epoch": 1.3611499809184582, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15810203552246, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8546059131622314, + "num_tokens": 408415856.0, + "step": 10700 + }, + { + "epoch": 1.3612771911970487, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.185991287231445, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.872230589389801, + "num_tokens": 408451522.0, + "step": 10701 + }, + { + "epoch": 1.3614044014756392, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.223838806152344, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8725771903991699, + "num_tokens": 408494605.0, + "step": 10702 + }, + { + "epoch": 1.3615316117542298, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.08482551574707, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8661783337593079, + "num_tokens": 408536373.0, + "step": 10703 + }, + { + "epoch": 1.3616588220328203, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.221426010131836, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8541097640991211, + "num_tokens": 408574724.0, + "step": 10704 + }, + { + "epoch": 1.3617860323114108, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.14969825744629, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.860014796257019, + "num_tokens": 408615057.0, + "step": 10705 + }, + { + "epoch": 1.3619132425900013, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.274982452392578, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8551198840141296, + "num_tokens": 408651324.0, + "step": 10706 + }, + { + "epoch": 1.3620404528685919, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.13847541809082, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8727433085441589, + "num_tokens": 408690533.0, + "step": 10707 + }, + { + "epoch": 1.3621676631471824, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34676170349121, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8598092794418335, + "num_tokens": 408725970.0, + "step": 10708 + }, + { + "epoch": 1.362294873425773, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.232606887817383, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8752807974815369, + "num_tokens": 408766550.0, + "step": 10709 + }, + { + "epoch": 1.3624220837043632, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.220304489135742, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8710010051727295, + "num_tokens": 408804320.0, + "step": 10710 + }, + { + "epoch": 1.3625492939829538, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.160816192626953, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8701760768890381, + "num_tokens": 408840902.0, + "step": 10711 + }, + { + "epoch": 1.3626765042615443, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34632682800293, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8651323318481445, + "num_tokens": 408879346.0, + "step": 10712 + }, + { + "epoch": 1.3628037145401348, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.17003059387207, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8794611692428589, + "num_tokens": 408919336.0, + "step": 10713 + }, + { + "epoch": 1.3629309248187254, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.203815460205078, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8622266054153442, + "num_tokens": 408961547.0, + "step": 10714 + }, + { + "epoch": 1.3630581350973159, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31421661376953, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8826619982719421, + "num_tokens": 409003573.0, + "step": 10715 + }, + { + "epoch": 1.3631853453759064, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.224082946777344, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8601844906806946, + "num_tokens": 409045560.0, + "step": 10716 + }, + { + "epoch": 1.363312555654497, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.240209579467773, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8707700967788696, + "num_tokens": 409087245.0, + "step": 10717 + }, + { + "epoch": 1.3634397659330875, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.304981231689453, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8761509656906128, + "num_tokens": 409134177.0, + "step": 10718 + }, + { + "epoch": 1.3635669762116778, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.256141662597656, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8533327579498291, + "num_tokens": 409169495.0, + "step": 10719 + }, + { + "epoch": 1.3636941864902683, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15996551513672, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8432631492614746, + "num_tokens": 409207781.0, + "step": 10720 + }, + { + "epoch": 1.3638213967688588, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47433853149414, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8664141297340393, + "num_tokens": 409245411.0, + "step": 10721 + }, + { + "epoch": 1.3639486070474494, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.079843521118164, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8745397925376892, + "num_tokens": 409281757.0, + "step": 10722 + }, + { + "epoch": 1.3640758173260399, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.469148635864258, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8553526401519775, + "num_tokens": 409321317.0, + "step": 10723 + }, + { + "epoch": 1.3642030276046304, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2403564453125, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.874251127243042, + "num_tokens": 409358919.0, + "step": 10724 + }, + { + "epoch": 1.364330237883221, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.101337432861328, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8583229780197144, + "num_tokens": 409394355.0, + "step": 10725 + }, + { + "epoch": 1.3644574481618115, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.361412048339844, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8767511248588562, + "num_tokens": 409433781.0, + "step": 10726 + }, + { + "epoch": 1.364584658440402, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23657989501953, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8577492833137512, + "num_tokens": 409471728.0, + "step": 10727 + }, + { + "epoch": 1.3647118687189925, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.257343292236328, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8854706287384033, + "num_tokens": 409502294.0, + "step": 10728 + }, + { + "epoch": 1.364839078997583, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.393442153930664, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.869156539440155, + "num_tokens": 409545878.0, + "step": 10729 + }, + { + "epoch": 1.3649662892761736, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.205398559570312, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8699720501899719, + "num_tokens": 409584753.0, + "step": 10730 + }, + { + "epoch": 1.3650934995547641, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.232969284057617, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8710018992424011, + "num_tokens": 409613725.0, + "step": 10731 + }, + { + "epoch": 1.3652207098333546, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.250926971435547, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8604487180709839, + "num_tokens": 409652443.0, + "step": 10732 + }, + { + "epoch": 1.3653479201119452, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.188753128051758, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8729221224784851, + "num_tokens": 409690683.0, + "step": 10733 + }, + { + "epoch": 1.3654751303905355, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.356414794921875, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8680034279823303, + "num_tokens": 409727502.0, + "step": 10734 + }, + { + "epoch": 1.365602340669126, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15690040588379, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8636288642883301, + "num_tokens": 409764592.0, + "step": 10735 + }, + { + "epoch": 1.3657295509477165, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.124467849731445, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8458839654922485, + "num_tokens": 409809645.0, + "step": 10736 + }, + { + "epoch": 1.365856761226307, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.197879791259766, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8669655323028564, + "num_tokens": 409848048.0, + "step": 10737 + }, + { + "epoch": 1.3659839715048976, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.220943450927734, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.862308919429779, + "num_tokens": 409884176.0, + "step": 10738 + }, + { + "epoch": 1.3661111817834881, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.22258186340332, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8698659539222717, + "num_tokens": 409922630.0, + "step": 10739 + }, + { + "epoch": 1.3662383920620786, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.191551208496094, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8730860948562622, + "num_tokens": 409961876.0, + "step": 10740 + }, + { + "epoch": 1.3663656023406692, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.193119049072266, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8761679530143738, + "num_tokens": 409995432.0, + "step": 10741 + }, + { + "epoch": 1.3664928126192597, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.220243453979492, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.865118145942688, + "num_tokens": 410034876.0, + "step": 10742 + }, + { + "epoch": 1.3666200228978502, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3042049407959, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8702569007873535, + "num_tokens": 410079190.0, + "step": 10743 + }, + { + "epoch": 1.3667472331764405, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.234760284423828, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8705536723136902, + "num_tokens": 410112475.0, + "step": 10744 + }, + { + "epoch": 1.366874443455031, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30858612060547, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8816688060760498, + "num_tokens": 410154583.0, + "step": 10745 + }, + { + "epoch": 1.3670016537336216, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.207788467407227, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8668686151504517, + "num_tokens": 410191631.0, + "step": 10746 + }, + { + "epoch": 1.3671288640122121, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 22.96243667602539, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.844925045967102, + "num_tokens": 410232435.0, + "step": 10747 + }, + { + "epoch": 1.3672560742908026, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.33245849609375, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8715999722480774, + "num_tokens": 410272940.0, + "step": 10748 + }, + { + "epoch": 1.3673832845693932, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.06466293334961, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8631418943405151, + "num_tokens": 410312807.0, + "step": 10749 + }, + { + "epoch": 1.3675104948479837, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18251609802246, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8726686835289001, + "num_tokens": 410352433.0, + "step": 10750 + }, + { + "epoch": 1.3676377051265742, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.161781311035156, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8675424456596375, + "num_tokens": 410394843.0, + "step": 10751 + }, + { + "epoch": 1.3677649154051648, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28133201599121, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8687039613723755, + "num_tokens": 410439353.0, + "step": 10752 + }, + { + "epoch": 1.3678921256837553, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16228675842285, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8458960056304932, + "num_tokens": 410485789.0, + "step": 10753 + }, + { + "epoch": 1.3680193359623458, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28127098083496, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8523540496826172, + "num_tokens": 410523385.0, + "step": 10754 + }, + { + "epoch": 1.3681465462409363, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.184059143066406, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8611147403717041, + "num_tokens": 410553340.0, + "step": 10755 + }, + { + "epoch": 1.3682737565195269, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.29340171813965, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8760669827461243, + "num_tokens": 410590041.0, + "step": 10756 + }, + { + "epoch": 1.3684009667981174, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20621109008789, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8689923286437988, + "num_tokens": 410627703.0, + "step": 10757 + }, + { + "epoch": 1.368528177076708, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20545768737793, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8591969013214111, + "num_tokens": 410662555.0, + "step": 10758 + }, + { + "epoch": 1.3686553873552982, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.249135971069336, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8492367267608643, + "num_tokens": 410706026.0, + "step": 10759 + }, + { + "epoch": 1.3687825976338888, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.325632095336914, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8649776577949524, + "num_tokens": 410743275.0, + "step": 10760 + }, + { + "epoch": 1.3689098079124793, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.224529266357422, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8764098882675171, + "num_tokens": 410781220.0, + "step": 10761 + }, + { + "epoch": 1.3690370181910698, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.183279037475586, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8754295110702515, + "num_tokens": 410820014.0, + "step": 10762 + }, + { + "epoch": 1.3691642284696603, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.277366638183594, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8619032502174377, + "num_tokens": 410858721.0, + "step": 10763 + }, + { + "epoch": 1.3692914387482509, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20827293395996, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8726907968521118, + "num_tokens": 410898527.0, + "step": 10764 + }, + { + "epoch": 1.3694186490268414, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.255556106567383, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8673678636550903, + "num_tokens": 410938426.0, + "step": 10765 + }, + { + "epoch": 1.369545859305432, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.027029037475586, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8718846440315247, + "num_tokens": 410969940.0, + "step": 10766 + }, + { + "epoch": 1.3696730695840225, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.294160842895508, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8523910641670227, + "num_tokens": 411008859.0, + "step": 10767 + }, + { + "epoch": 1.3698002798626128, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.216463088989258, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8812699317932129, + "num_tokens": 411043516.0, + "step": 10768 + }, + { + "epoch": 1.3699274901412033, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.051254272460938, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8598735332489014, + "num_tokens": 411079092.0, + "step": 10769 + }, + { + "epoch": 1.3700547004197938, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.278217315673828, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8710688352584839, + "num_tokens": 411117360.0, + "step": 10770 + }, + { + "epoch": 1.3701819106983844, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35472297668457, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8574819564819336, + "num_tokens": 411153615.0, + "step": 10771 + }, + { + "epoch": 1.3703091209769749, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.309396743774414, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8690874576568604, + "num_tokens": 411193571.0, + "step": 10772 + }, + { + "epoch": 1.3704363312555654, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3592529296875, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8470841646194458, + "num_tokens": 411230427.0, + "step": 10773 + }, + { + "epoch": 1.370563541534156, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2047176361084, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8645257949829102, + "num_tokens": 411271251.0, + "step": 10774 + }, + { + "epoch": 1.3706907518127465, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.468469619750977, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.869408369064331, + "num_tokens": 411304906.0, + "step": 10775 + }, + { + "epoch": 1.370817962091337, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.361848831176758, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.884078860282898, + "num_tokens": 411341417.0, + "step": 10776 + }, + { + "epoch": 1.3709451723699275, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.29279136657715, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8677825927734375, + "num_tokens": 411382118.0, + "step": 10777 + }, + { + "epoch": 1.371072382648518, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21159553527832, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8562046885490417, + "num_tokens": 411426702.0, + "step": 10778 + }, + { + "epoch": 1.3711995929271086, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.140165328979492, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8657288551330566, + "num_tokens": 411461689.0, + "step": 10779 + }, + { + "epoch": 1.371326803205699, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35480499267578, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8683605194091797, + "num_tokens": 411499613.0, + "step": 10780 + }, + { + "epoch": 1.3714540134842896, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.134218215942383, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8653185963630676, + "num_tokens": 411538422.0, + "step": 10781 + }, + { + "epoch": 1.3715812237628802, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.391857147216797, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8789317011833191, + "num_tokens": 411572401.0, + "step": 10782 + }, + { + "epoch": 1.3717084340414705, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.33759880065918, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8738728165626526, + "num_tokens": 411611268.0, + "step": 10783 + }, + { + "epoch": 1.371835644320061, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.413846969604492, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8663876056671143, + "num_tokens": 411648837.0, + "step": 10784 + }, + { + "epoch": 1.3719628545986515, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.122087478637695, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8702593445777893, + "num_tokens": 411690327.0, + "step": 10785 + }, + { + "epoch": 1.372090064877242, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.212421417236328, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8663496971130371, + "num_tokens": 411729665.0, + "step": 10786 + }, + { + "epoch": 1.3722172751558326, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.163532257080078, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.881094217300415, + "num_tokens": 411764716.0, + "step": 10787 + }, + { + "epoch": 1.3723444854344231, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2938289642334, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8686554431915283, + "num_tokens": 411805560.0, + "step": 10788 + }, + { + "epoch": 1.3724716957130136, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.08484649658203, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8710026741027832, + "num_tokens": 411841151.0, + "step": 10789 + }, + { + "epoch": 1.3725989059916042, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23438835144043, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8739047050476074, + "num_tokens": 411875088.0, + "step": 10790 + }, + { + "epoch": 1.3727261162701947, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.357128143310547, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8679234981536865, + "num_tokens": 411914554.0, + "step": 10791 + }, + { + "epoch": 1.3728533265487852, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.178823471069336, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8670492172241211, + "num_tokens": 411952237.0, + "step": 10792 + }, + { + "epoch": 1.3729805368273755, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.40069580078125, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8615942001342773, + "num_tokens": 411989676.0, + "step": 10793 + }, + { + "epoch": 1.373107747105966, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.133689880371094, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8794177770614624, + "num_tokens": 412025804.0, + "step": 10794 + }, + { + "epoch": 1.3732349573845566, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.327518463134766, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8831084966659546, + "num_tokens": 412060085.0, + "step": 10795 + }, + { + "epoch": 1.3733621676631471, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.061716079711914, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8744721412658691, + "num_tokens": 412100750.0, + "step": 10796 + }, + { + "epoch": 1.3734893779417376, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.218608856201172, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8696443438529968, + "num_tokens": 412145829.0, + "step": 10797 + }, + { + "epoch": 1.3736165882203282, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21375846862793, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8724154233932495, + "num_tokens": 412187607.0, + "step": 10798 + }, + { + "epoch": 1.3737437984989187, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.27882957458496, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8548635840415955, + "num_tokens": 412227214.0, + "step": 10799 + }, + { + "epoch": 1.3738710087775092, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.380956649780273, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8703272938728333, + "num_tokens": 412261929.0, + "step": 10800 + }, + { + "epoch": 1.3739982190560998, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.387868881225586, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8631293773651123, + "num_tokens": 412299441.0, + "step": 10801 + }, + { + "epoch": 1.3741254293346903, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23509407043457, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8653243184089661, + "num_tokens": 412339248.0, + "step": 10802 + }, + { + "epoch": 1.3742526396132808, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.533796310424805, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8659210205078125, + "num_tokens": 412377224.0, + "step": 10803 + }, + { + "epoch": 1.3743798498918713, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.247926712036133, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8446776866912842, + "num_tokens": 412415331.0, + "step": 10804 + }, + { + "epoch": 1.3745070601704619, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18128776550293, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8666564226150513, + "num_tokens": 412455604.0, + "step": 10805 + }, + { + "epoch": 1.3746342704490524, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.384172439575195, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8744202852249146, + "num_tokens": 412489295.0, + "step": 10806 + }, + { + "epoch": 1.374761480727643, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.082612991333008, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8636326789855957, + "num_tokens": 412525750.0, + "step": 10807 + }, + { + "epoch": 1.3748886910062332, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.257915496826172, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.876742959022522, + "num_tokens": 412567163.0, + "step": 10808 + }, + { + "epoch": 1.3750159012848238, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.149137496948242, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.874756932258606, + "num_tokens": 412598789.0, + "step": 10809 + }, + { + "epoch": 1.3751431115634143, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.250625610351562, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.870753288269043, + "num_tokens": 412632588.0, + "step": 10810 + }, + { + "epoch": 1.3752703218420048, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36984634399414, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8667900562286377, + "num_tokens": 412676369.0, + "step": 10811 + }, + { + "epoch": 1.3753975321205953, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.174970626831055, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8815531730651855, + "num_tokens": 412712674.0, + "step": 10812 + }, + { + "epoch": 1.3755247423991859, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.281553268432617, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8619693517684937, + "num_tokens": 412753531.0, + "step": 10813 + }, + { + "epoch": 1.3756519526777764, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.271631240844727, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8655794858932495, + "num_tokens": 412799450.0, + "step": 10814 + }, + { + "epoch": 1.375779162956367, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.03356170654297, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8891505002975464, + "num_tokens": 412836858.0, + "step": 10815 + }, + { + "epoch": 1.3759063732349575, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.351945877075195, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8712581992149353, + "num_tokens": 412871169.0, + "step": 10816 + }, + { + "epoch": 1.3760335835135478, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.016033172607422, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8720778822898865, + "num_tokens": 412909281.0, + "step": 10817 + }, + { + "epoch": 1.3761607937921383, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20838737487793, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8693021535873413, + "num_tokens": 412950126.0, + "step": 10818 + }, + { + "epoch": 1.3762880040707288, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21282196044922, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8549306988716125, + "num_tokens": 412988644.0, + "step": 10819 + }, + { + "epoch": 1.3764152143493193, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.167644500732422, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8539283275604248, + "num_tokens": 413028403.0, + "step": 10820 + }, + { + "epoch": 1.3765424246279099, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15688133239746, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8588820695877075, + "num_tokens": 413067429.0, + "step": 10821 + }, + { + "epoch": 1.3766696349065004, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35971450805664, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8641940951347351, + "num_tokens": 413101630.0, + "step": 10822 + }, + { + "epoch": 1.376796845185091, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.288888931274414, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8615880608558655, + "num_tokens": 413147757.0, + "step": 10823 + }, + { + "epoch": 1.3769240554636815, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.355920791625977, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8710404634475708, + "num_tokens": 413186803.0, + "step": 10824 + }, + { + "epoch": 1.377051265742272, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25975227355957, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8616409301757812, + "num_tokens": 413227201.0, + "step": 10825 + }, + { + "epoch": 1.3771784760208625, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.174156188964844, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8657592535018921, + "num_tokens": 413270594.0, + "step": 10826 + }, + { + "epoch": 1.377305686299453, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.332027435302734, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8497704267501831, + "num_tokens": 413307345.0, + "step": 10827 + }, + { + "epoch": 1.3774328965780436, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31169319152832, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8714761734008789, + "num_tokens": 413344310.0, + "step": 10828 + }, + { + "epoch": 1.377560106856634, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.313692092895508, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8865413069725037, + "num_tokens": 413382047.0, + "step": 10829 + }, + { + "epoch": 1.3776873171352246, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.340579986572266, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8650494813919067, + "num_tokens": 413424658.0, + "step": 10830 + }, + { + "epoch": 1.3778145274138152, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.179630279541016, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8751806020736694, + "num_tokens": 413458386.0, + "step": 10831 + }, + { + "epoch": 1.3779417376924055, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.243946075439453, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8547798991203308, + "num_tokens": 413492530.0, + "step": 10832 + }, + { + "epoch": 1.378068947970996, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.190750122070312, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8787493705749512, + "num_tokens": 413528894.0, + "step": 10833 + }, + { + "epoch": 1.3781961582495865, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.356443405151367, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8639132976531982, + "num_tokens": 413564765.0, + "step": 10834 + }, + { + "epoch": 1.378323368528177, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28103256225586, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8401849269866943, + "num_tokens": 413598122.0, + "step": 10835 + }, + { + "epoch": 1.3784505788067676, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.282020568847656, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8501852750778198, + "num_tokens": 413634519.0, + "step": 10836 + }, + { + "epoch": 1.378577789085358, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.168134689331055, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.859524130821228, + "num_tokens": 413674021.0, + "step": 10837 + }, + { + "epoch": 1.3787049993639486, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.516305923461914, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8730438351631165, + "num_tokens": 413713804.0, + "step": 10838 + }, + { + "epoch": 1.3788322096425392, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.361854553222656, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8743808269500732, + "num_tokens": 413755665.0, + "step": 10839 + }, + { + "epoch": 1.3789594199211297, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.315629959106445, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8663316369056702, + "num_tokens": 413790872.0, + "step": 10840 + }, + { + "epoch": 1.3790866301997202, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.434326171875, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.886734664440155, + "num_tokens": 413830798.0, + "step": 10841 + }, + { + "epoch": 1.3792138404783105, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.127534866333008, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8714605569839478, + "num_tokens": 413865960.0, + "step": 10842 + }, + { + "epoch": 1.379341050756901, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.531171798706055, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8772151470184326, + "num_tokens": 413898222.0, + "step": 10843 + }, + { + "epoch": 1.3794682610354916, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.327106475830078, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8616235256195068, + "num_tokens": 413931839.0, + "step": 10844 + }, + { + "epoch": 1.3795954713140821, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.227148056030273, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8586792945861816, + "num_tokens": 413963815.0, + "step": 10845 + }, + { + "epoch": 1.3797226815926726, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2708683013916, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8759536743164062, + "num_tokens": 414002476.0, + "step": 10846 + }, + { + "epoch": 1.3798498918712632, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.14139747619629, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8625649213790894, + "num_tokens": 414041121.0, + "step": 10847 + }, + { + "epoch": 1.3799771021498537, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.591690063476562, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8552632331848145, + "num_tokens": 414080447.0, + "step": 10848 + }, + { + "epoch": 1.3801043124284442, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.280414581298828, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8566488027572632, + "num_tokens": 414108925.0, + "step": 10849 + }, + { + "epoch": 1.3802315227070348, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3303279876709, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8719393610954285, + "num_tokens": 414154296.0, + "step": 10850 + }, + { + "epoch": 1.3803587329856253, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.427371978759766, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8558867573738098, + "num_tokens": 414194612.0, + "step": 10851 + }, + { + "epoch": 1.3804859432642158, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.368783950805664, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8703222274780273, + "num_tokens": 414229939.0, + "step": 10852 + }, + { + "epoch": 1.3806131535428063, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.314647674560547, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8646234273910522, + "num_tokens": 414265193.0, + "step": 10853 + }, + { + "epoch": 1.3807403638213969, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45707893371582, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8645234107971191, + "num_tokens": 414301471.0, + "step": 10854 + }, + { + "epoch": 1.3808675740999874, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.335838317871094, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8385810256004333, + "num_tokens": 414338755.0, + "step": 10855 + }, + { + "epoch": 1.380994784378578, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25978660583496, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8684260845184326, + "num_tokens": 414379996.0, + "step": 10856 + }, + { + "epoch": 1.3811219946571682, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49945068359375, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8694425821304321, + "num_tokens": 414421905.0, + "step": 10857 + }, + { + "epoch": 1.3812492049357588, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28056526184082, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.872979998588562, + "num_tokens": 414465149.0, + "step": 10858 + }, + { + "epoch": 1.3813764152143493, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35467529296875, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8776498436927795, + "num_tokens": 414504086.0, + "step": 10859 + }, + { + "epoch": 1.3815036254929398, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.168527603149414, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8506910800933838, + "num_tokens": 414545392.0, + "step": 10860 + }, + { + "epoch": 1.3816308357715303, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.392719268798828, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.831088125705719, + "num_tokens": 414586445.0, + "step": 10861 + }, + { + "epoch": 1.3817580460501209, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18417739868164, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8766297698020935, + "num_tokens": 414618467.0, + "step": 10862 + }, + { + "epoch": 1.3818852563287114, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.384849548339844, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8775264024734497, + "num_tokens": 414651253.0, + "step": 10863 + }, + { + "epoch": 1.382012466607302, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.368288040161133, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8664418458938599, + "num_tokens": 414692016.0, + "step": 10864 + }, + { + "epoch": 1.3821396768858925, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.226953506469727, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8731549978256226, + "num_tokens": 414731840.0, + "step": 10865 + }, + { + "epoch": 1.3822668871644828, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41866683959961, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8686857223510742, + "num_tokens": 414767110.0, + "step": 10866 + }, + { + "epoch": 1.3823940974430733, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.194381713867188, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8727985620498657, + "num_tokens": 414799340.0, + "step": 10867 + }, + { + "epoch": 1.3825213077216638, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52171516418457, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8699455261230469, + "num_tokens": 414840915.0, + "step": 10868 + }, + { + "epoch": 1.3826485180002543, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.302858352661133, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8551217913627625, + "num_tokens": 414877298.0, + "step": 10869 + }, + { + "epoch": 1.3827757282788449, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.254497528076172, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8744281530380249, + "num_tokens": 414916431.0, + "step": 10870 + }, + { + "epoch": 1.3829029385574354, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47183609008789, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8656977415084839, + "num_tokens": 414958148.0, + "step": 10871 + }, + { + "epoch": 1.383030148836026, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.177650451660156, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8917453289031982, + "num_tokens": 414997008.0, + "step": 10872 + }, + { + "epoch": 1.3831573591146165, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35677719116211, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8537771701812744, + "num_tokens": 415042893.0, + "step": 10873 + }, + { + "epoch": 1.383284569393207, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46640396118164, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8653770685195923, + "num_tokens": 415077476.0, + "step": 10874 + }, + { + "epoch": 1.3834117796717975, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.225767135620117, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8622537851333618, + "num_tokens": 415124329.0, + "step": 10875 + }, + { + "epoch": 1.383538989950388, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.42120361328125, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8785588145256042, + "num_tokens": 415164101.0, + "step": 10876 + }, + { + "epoch": 1.3836662002289786, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.432050704956055, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8678581714630127, + "num_tokens": 415203792.0, + "step": 10877 + }, + { + "epoch": 1.383793410507569, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.376976013183594, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8646893501281738, + "num_tokens": 415233754.0, + "step": 10878 + }, + { + "epoch": 1.3839206207861596, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.458208084106445, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8748027086257935, + "num_tokens": 415269567.0, + "step": 10879 + }, + { + "epoch": 1.3840478310647502, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.377994537353516, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8704712390899658, + "num_tokens": 415312660.0, + "step": 10880 + }, + { + "epoch": 1.3841750413433405, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.197338104248047, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.878045380115509, + "num_tokens": 415355377.0, + "step": 10881 + }, + { + "epoch": 1.384302251621931, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.353445053100586, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.879131019115448, + "num_tokens": 415393109.0, + "step": 10882 + }, + { + "epoch": 1.3844294619005215, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.407228469848633, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.879326343536377, + "num_tokens": 415428650.0, + "step": 10883 + }, + { + "epoch": 1.384556672179112, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.082334518432617, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8654597401618958, + "num_tokens": 415467166.0, + "step": 10884 + }, + { + "epoch": 1.3846838824577026, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44729232788086, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8805158734321594, + "num_tokens": 415505633.0, + "step": 10885 + }, + { + "epoch": 1.384811092736293, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.355873107910156, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8671869039535522, + "num_tokens": 415546254.0, + "step": 10886 + }, + { + "epoch": 1.3849383030148836, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.554176330566406, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8660480976104736, + "num_tokens": 415584999.0, + "step": 10887 + }, + { + "epoch": 1.3850655132934742, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.493818283081055, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8700765371322632, + "num_tokens": 415624608.0, + "step": 10888 + }, + { + "epoch": 1.3851927235720647, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.63460922241211, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8547794818878174, + "num_tokens": 415663405.0, + "step": 10889 + }, + { + "epoch": 1.385319933850655, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4453182220459, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8819634914398193, + "num_tokens": 415698934.0, + "step": 10890 + }, + { + "epoch": 1.3854471441292455, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.282014846801758, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8398836851119995, + "num_tokens": 415736973.0, + "step": 10891 + }, + { + "epoch": 1.385574354407836, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.577857971191406, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.870330274105072, + "num_tokens": 415778364.0, + "step": 10892 + }, + { + "epoch": 1.3857015646864266, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.29396629333496, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.88315349817276, + "num_tokens": 415817502.0, + "step": 10893 + }, + { + "epoch": 1.385828774965017, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.325885772705078, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8625364303588867, + "num_tokens": 415860364.0, + "step": 10894 + }, + { + "epoch": 1.3859559852436076, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.362651824951172, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8710476756095886, + "num_tokens": 415896816.0, + "step": 10895 + }, + { + "epoch": 1.3860831955221982, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.26299476623535, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8493116497993469, + "num_tokens": 415935173.0, + "step": 10896 + }, + { + "epoch": 1.3862104058007887, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.277751922607422, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8772946000099182, + "num_tokens": 415977043.0, + "step": 10897 + }, + { + "epoch": 1.3863376160793792, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.274677276611328, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8772237300872803, + "num_tokens": 416014376.0, + "step": 10898 + }, + { + "epoch": 1.3864648263579697, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.350963592529297, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8549364805221558, + "num_tokens": 416055648.0, + "step": 10899 + }, + { + "epoch": 1.3865920366365603, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30385971069336, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8597017526626587, + "num_tokens": 416087062.0, + "step": 10900 + }, + { + "epoch": 1.3867192469151508, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.401941299438477, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8740621209144592, + "num_tokens": 416122559.0, + "step": 10901 + }, + { + "epoch": 1.3868464571937413, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.312885284423828, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8597900867462158, + "num_tokens": 416155228.0, + "step": 10902 + }, + { + "epoch": 1.3869736674723319, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.312744140625, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8537389636039734, + "num_tokens": 416191190.0, + "step": 10903 + }, + { + "epoch": 1.3871008777509224, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.26300621032715, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.871429443359375, + "num_tokens": 416228009.0, + "step": 10904 + }, + { + "epoch": 1.387228088029513, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.16712760925293, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8629165291786194, + "num_tokens": 416263657.0, + "step": 10905 + }, + { + "epoch": 1.3873552983081032, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.288888931274414, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8611239194869995, + "num_tokens": 416305187.0, + "step": 10906 + }, + { + "epoch": 1.3874825085866938, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.138689041137695, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8651040196418762, + "num_tokens": 416342578.0, + "step": 10907 + }, + { + "epoch": 1.3876097188652843, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.423446655273438, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8652467727661133, + "num_tokens": 416378969.0, + "step": 10908 + }, + { + "epoch": 1.3877369291438748, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23297119140625, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8654282093048096, + "num_tokens": 416412958.0, + "step": 10909 + }, + { + "epoch": 1.3878641394224653, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.179931640625, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8489101529121399, + "num_tokens": 416445643.0, + "step": 10910 + }, + { + "epoch": 1.3879913497010559, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.246074676513672, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8539232611656189, + "num_tokens": 416484891.0, + "step": 10911 + }, + { + "epoch": 1.3881185599796464, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.386058807373047, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8535315990447998, + "num_tokens": 416520504.0, + "step": 10912 + }, + { + "epoch": 1.388245770258237, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.278648376464844, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8674418926239014, + "num_tokens": 416561877.0, + "step": 10913 + }, + { + "epoch": 1.3883729805368275, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2584285736084, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8684017658233643, + "num_tokens": 416597676.0, + "step": 10914 + }, + { + "epoch": 1.3885001908154178, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.268630981445312, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8739483952522278, + "num_tokens": 416630557.0, + "step": 10915 + }, + { + "epoch": 1.3886274010940083, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.19423484802246, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8702494502067566, + "num_tokens": 416668604.0, + "step": 10916 + }, + { + "epoch": 1.3887546113725988, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09490203857422, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8471426963806152, + "num_tokens": 416710246.0, + "step": 10917 + }, + { + "epoch": 1.3888818216511893, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.155588150024414, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.867215633392334, + "num_tokens": 416744962.0, + "step": 10918 + }, + { + "epoch": 1.3890090319297799, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.241165161132812, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8683092594146729, + "num_tokens": 416779483.0, + "step": 10919 + }, + { + "epoch": 1.3891362422083704, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.415138244628906, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8800468444824219, + "num_tokens": 416822135.0, + "step": 10920 + }, + { + "epoch": 1.389263452486961, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.464204788208008, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8650952577590942, + "num_tokens": 416860383.0, + "step": 10921 + }, + { + "epoch": 1.3893906627655515, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.268033981323242, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8631216883659363, + "num_tokens": 416896960.0, + "step": 10922 + }, + { + "epoch": 1.389517873044142, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.217914581298828, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.87775719165802, + "num_tokens": 416931981.0, + "step": 10923 + }, + { + "epoch": 1.3896450833227325, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.27145004272461, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8754954934120178, + "num_tokens": 416968179.0, + "step": 10924 + }, + { + "epoch": 1.389772293601323, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5327091217041, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8717395067214966, + "num_tokens": 417008879.0, + "step": 10925 + }, + { + "epoch": 1.3898995038799136, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4963436126709, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8785744905471802, + "num_tokens": 417042497.0, + "step": 10926 + }, + { + "epoch": 1.390026714158504, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.237703323364258, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8678431510925293, + "num_tokens": 417084396.0, + "step": 10927 + }, + { + "epoch": 1.3901539244370946, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.288240432739258, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8809471726417542, + "num_tokens": 417121014.0, + "step": 10928 + }, + { + "epoch": 1.3902811347156852, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35238265991211, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8667563199996948, + "num_tokens": 417160169.0, + "step": 10929 + }, + { + "epoch": 1.3904083449942755, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.161720275878906, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8511295318603516, + "num_tokens": 417195024.0, + "step": 10930 + }, + { + "epoch": 1.390535555272866, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.411714553833008, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8731461763381958, + "num_tokens": 417231344.0, + "step": 10931 + }, + { + "epoch": 1.3906627655514565, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.522802352905273, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8585047125816345, + "num_tokens": 417270266.0, + "step": 10932 + }, + { + "epoch": 1.390789975830047, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.261117935180664, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8658273816108704, + "num_tokens": 417308292.0, + "step": 10933 + }, + { + "epoch": 1.3909171861086376, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.133928298950195, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8646310567855835, + "num_tokens": 417346358.0, + "step": 10934 + }, + { + "epoch": 1.391044396387228, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.467117309570312, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8682436943054199, + "num_tokens": 417381924.0, + "step": 10935 + }, + { + "epoch": 1.3911716066658186, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.162492752075195, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8411360383033752, + "num_tokens": 417417779.0, + "step": 10936 + }, + { + "epoch": 1.3912988169444092, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.197704315185547, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8593927025794983, + "num_tokens": 417459076.0, + "step": 10937 + }, + { + "epoch": 1.3914260272229997, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.24858856201172, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8666810989379883, + "num_tokens": 417499422.0, + "step": 10938 + }, + { + "epoch": 1.39155323750159, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.311115264892578, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8672078251838684, + "num_tokens": 417541776.0, + "step": 10939 + }, + { + "epoch": 1.3916804477801805, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20571517944336, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8547707200050354, + "num_tokens": 417576469.0, + "step": 10940 + }, + { + "epoch": 1.391807658058771, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20899772644043, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8680120706558228, + "num_tokens": 417611726.0, + "step": 10941 + }, + { + "epoch": 1.3919348683373616, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.26678466796875, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8601275682449341, + "num_tokens": 417650135.0, + "step": 10942 + }, + { + "epoch": 1.392062078615952, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.401458740234375, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8484057188034058, + "num_tokens": 417690981.0, + "step": 10943 + }, + { + "epoch": 1.3921892888945426, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.418733596801758, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8552446365356445, + "num_tokens": 417732152.0, + "step": 10944 + }, + { + "epoch": 1.3923164991731332, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.33671760559082, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8564695715904236, + "num_tokens": 417771814.0, + "step": 10945 + }, + { + "epoch": 1.3924437094517237, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.346715927124023, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8567862510681152, + "num_tokens": 417810663.0, + "step": 10946 + }, + { + "epoch": 1.3925709197303142, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.400035858154297, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8701279759407043, + "num_tokens": 417845664.0, + "step": 10947 + }, + { + "epoch": 1.3926981300089047, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.271827697753906, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8584914207458496, + "num_tokens": 417889617.0, + "step": 10948 + }, + { + "epoch": 1.3928253402874953, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.345563888549805, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8637611269950867, + "num_tokens": 417930820.0, + "step": 10949 + }, + { + "epoch": 1.3929525505660858, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.227697372436523, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8684278726577759, + "num_tokens": 417966799.0, + "step": 10950 + }, + { + "epoch": 1.3930797608446763, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30980682373047, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8590981960296631, + "num_tokens": 418009328.0, + "step": 10951 + }, + { + "epoch": 1.3932069711232669, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.119958877563477, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8693046569824219, + "num_tokens": 418048548.0, + "step": 10952 + }, + { + "epoch": 1.3933341814018574, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.342676162719727, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8642991781234741, + "num_tokens": 418094947.0, + "step": 10953 + }, + { + "epoch": 1.393461391680448, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41128158569336, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8570070266723633, + "num_tokens": 418137144.0, + "step": 10954 + }, + { + "epoch": 1.3935886019590382, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09687042236328, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8877853155136108, + "num_tokens": 418171449.0, + "step": 10955 + }, + { + "epoch": 1.3937158122376287, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.355003356933594, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8687455654144287, + "num_tokens": 418209854.0, + "step": 10956 + }, + { + "epoch": 1.3938430225162193, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.283859252929688, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8738694787025452, + "num_tokens": 418245121.0, + "step": 10957 + }, + { + "epoch": 1.3939702327948098, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.456254959106445, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8699498772621155, + "num_tokens": 418284158.0, + "step": 10958 + }, + { + "epoch": 1.3940974430734003, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.241596221923828, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8732555508613586, + "num_tokens": 418324091.0, + "step": 10959 + }, + { + "epoch": 1.3942246533519909, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.15884780883789, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8669259548187256, + "num_tokens": 418364764.0, + "step": 10960 + }, + { + "epoch": 1.3943518636305814, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.311338424682617, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8634105920791626, + "num_tokens": 418403974.0, + "step": 10961 + }, + { + "epoch": 1.394479073909172, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.294557571411133, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8568369150161743, + "num_tokens": 418446341.0, + "step": 10962 + }, + { + "epoch": 1.3946062841877624, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18892478942871, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8664060235023499, + "num_tokens": 418490989.0, + "step": 10963 + }, + { + "epoch": 1.3947334944663528, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.282207489013672, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8608794212341309, + "num_tokens": 418524633.0, + "step": 10964 + }, + { + "epoch": 1.3948607047449433, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.1743221282959, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8660014867782593, + "num_tokens": 418562576.0, + "step": 10965 + }, + { + "epoch": 1.3949879150235338, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34734344482422, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8663303852081299, + "num_tokens": 418596537.0, + "step": 10966 + }, + { + "epoch": 1.3951151253021243, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30657196044922, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8641108274459839, + "num_tokens": 418625800.0, + "step": 10967 + }, + { + "epoch": 1.3952423355807149, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.352861404418945, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8528933525085449, + "num_tokens": 418663818.0, + "step": 10968 + }, + { + "epoch": 1.3953695458593054, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.405452728271484, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8717051148414612, + "num_tokens": 418700607.0, + "step": 10969 + }, + { + "epoch": 1.395496756137896, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.245222091674805, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8602613210678101, + "num_tokens": 418739357.0, + "step": 10970 + }, + { + "epoch": 1.3956239664164865, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.368934631347656, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8721082806587219, + "num_tokens": 418769752.0, + "step": 10971 + }, + { + "epoch": 1.395751176695077, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.399951934814453, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8476624488830566, + "num_tokens": 418808196.0, + "step": 10972 + }, + { + "epoch": 1.3958783869736675, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.208866119384766, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8821063041687012, + "num_tokens": 418843335.0, + "step": 10973 + }, + { + "epoch": 1.396005597252258, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.293392181396484, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8746217489242554, + "num_tokens": 418876591.0, + "step": 10974 + }, + { + "epoch": 1.3961328075308486, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18218994140625, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8532650470733643, + "num_tokens": 418910482.0, + "step": 10975 + }, + { + "epoch": 1.396260017809439, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23673439025879, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.857369065284729, + "num_tokens": 418942248.0, + "step": 10976 + }, + { + "epoch": 1.3963872280880296, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.565128326416016, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.869575023651123, + "num_tokens": 418977770.0, + "step": 10977 + }, + { + "epoch": 1.3965144383666201, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.236948013305664, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8596085906028748, + "num_tokens": 419016464.0, + "step": 10978 + }, + { + "epoch": 1.3966416486452105, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.569974899291992, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8573151230812073, + "num_tokens": 419053254.0, + "step": 10979 + }, + { + "epoch": 1.396768858923801, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.1866397857666, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8607275485992432, + "num_tokens": 419088940.0, + "step": 10980 + }, + { + "epoch": 1.3968960692023915, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.384933471679688, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8657224178314209, + "num_tokens": 419127897.0, + "step": 10981 + }, + { + "epoch": 1.397023279480982, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44172477722168, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8689426779747009, + "num_tokens": 419170921.0, + "step": 10982 + }, + { + "epoch": 1.3971504897595726, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.213712692260742, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8725435733795166, + "num_tokens": 419206032.0, + "step": 10983 + }, + { + "epoch": 1.397277700038163, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.302467346191406, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8810979723930359, + "num_tokens": 419244240.0, + "step": 10984 + }, + { + "epoch": 1.3974049103167536, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.322500228881836, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.87425696849823, + "num_tokens": 419274902.0, + "step": 10985 + }, + { + "epoch": 1.3975321205953442, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.139015197753906, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8783112168312073, + "num_tokens": 419312295.0, + "step": 10986 + }, + { + "epoch": 1.3976593308739347, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.495590209960938, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8494520783424377, + "num_tokens": 419350999.0, + "step": 10987 + }, + { + "epoch": 1.397786541152525, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35947608947754, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.867032527923584, + "num_tokens": 419386739.0, + "step": 10988 + }, + { + "epoch": 1.3979137514311155, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.303346633911133, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8741828799247742, + "num_tokens": 419430353.0, + "step": 10989 + }, + { + "epoch": 1.398040961709706, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.567934036254883, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8620923757553101, + "num_tokens": 419462910.0, + "step": 10990 + }, + { + "epoch": 1.3981681719882966, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45501136779785, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8762816190719604, + "num_tokens": 419498727.0, + "step": 10991 + }, + { + "epoch": 1.398295382266887, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.309417724609375, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8571882247924805, + "num_tokens": 419534496.0, + "step": 10992 + }, + { + "epoch": 1.3984225925454776, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.685771942138672, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8627252578735352, + "num_tokens": 419575760.0, + "step": 10993 + }, + { + "epoch": 1.3985498028240682, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.43259620666504, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8736752867698669, + "num_tokens": 419616822.0, + "step": 10994 + }, + { + "epoch": 1.3986770131026587, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46416664123535, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8728108406066895, + "num_tokens": 419650425.0, + "step": 10995 + }, + { + "epoch": 1.3988042233812492, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.475194931030273, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.874313235282898, + "num_tokens": 419685501.0, + "step": 10996 + }, + { + "epoch": 1.3989314336598397, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.302213668823242, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8688000440597534, + "num_tokens": 419724471.0, + "step": 10997 + }, + { + "epoch": 1.3990586439384303, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30803871154785, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8743418455123901, + "num_tokens": 419763683.0, + "step": 10998 + }, + { + "epoch": 1.3991858542170208, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.291982650756836, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8696210384368896, + "num_tokens": 419801238.0, + "step": 10999 + }, + { + "epoch": 1.3993130644956113, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.425193786621094, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8635808229446411, + "num_tokens": 419832787.0, + "step": 11000 + }, + { + "epoch": 1.3994402747742019, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34564208984375, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.856453001499176, + "num_tokens": 419864294.0, + "step": 11001 + }, + { + "epoch": 1.3995674850527924, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.24466323852539, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8696441650390625, + "num_tokens": 419900493.0, + "step": 11002 + }, + { + "epoch": 1.399694695331383, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41520881652832, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8852453231811523, + "num_tokens": 419938917.0, + "step": 11003 + }, + { + "epoch": 1.3998219056099732, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.33058738708496, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8824940919876099, + "num_tokens": 419978771.0, + "step": 11004 + }, + { + "epoch": 1.3999491158885637, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.225635528564453, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8649901747703552, + "num_tokens": 420011836.0, + "step": 11005 + }, + { + "epoch": 1.4000763261671543, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.477025985717773, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8464044332504272, + "num_tokens": 420051618.0, + "step": 11006 + }, + { + "epoch": 1.4002035364457448, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.086212158203125, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.854174017906189, + "num_tokens": 420089119.0, + "step": 11007 + }, + { + "epoch": 1.4003307467243353, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.26787757873535, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8783210515975952, + "num_tokens": 420123706.0, + "step": 11008 + }, + { + "epoch": 1.4004579570029259, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44198989868164, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8640746474266052, + "num_tokens": 420156723.0, + "step": 11009 + }, + { + "epoch": 1.4005851672815164, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.188657760620117, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8668166399002075, + "num_tokens": 420198519.0, + "step": 11010 + }, + { + "epoch": 1.400712377560107, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.350473403930664, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8799257278442383, + "num_tokens": 420231722.0, + "step": 11011 + }, + { + "epoch": 1.4008395878386974, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.259498596191406, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8701149225234985, + "num_tokens": 420272059.0, + "step": 11012 + }, + { + "epoch": 1.4009667981172877, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.389909744262695, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8729280829429626, + "num_tokens": 420307695.0, + "step": 11013 + }, + { + "epoch": 1.4010940083958783, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.24762725830078, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8771231174468994, + "num_tokens": 420345989.0, + "step": 11014 + }, + { + "epoch": 1.4012212186744688, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.141366958618164, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8735214471817017, + "num_tokens": 420387194.0, + "step": 11015 + }, + { + "epoch": 1.4013484289530593, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.264209747314453, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8822827339172363, + "num_tokens": 420427608.0, + "step": 11016 + }, + { + "epoch": 1.4014756392316499, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.290733337402344, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8738052248954773, + "num_tokens": 420461363.0, + "step": 11017 + }, + { + "epoch": 1.4016028495102404, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.197254180908203, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8493552803993225, + "num_tokens": 420503577.0, + "step": 11018 + }, + { + "epoch": 1.401730059788831, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.293649673461914, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.877784788608551, + "num_tokens": 420538852.0, + "step": 11019 + }, + { + "epoch": 1.4018572700674214, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25335693359375, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.863762617111206, + "num_tokens": 420577495.0, + "step": 11020 + }, + { + "epoch": 1.401984480346012, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.542755126953125, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8673266172409058, + "num_tokens": 420616123.0, + "step": 11021 + }, + { + "epoch": 1.4021116906246025, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.275876998901367, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8770594596862793, + "num_tokens": 420653904.0, + "step": 11022 + }, + { + "epoch": 1.402238900903193, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.356700897216797, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8908485174179077, + "num_tokens": 420693539.0, + "step": 11023 + }, + { + "epoch": 1.4023661111817836, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.38460350036621, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8717952370643616, + "num_tokens": 420729046.0, + "step": 11024 + }, + { + "epoch": 1.402493321460374, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34026336669922, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8705718517303467, + "num_tokens": 420766143.0, + "step": 11025 + }, + { + "epoch": 1.4026205317389646, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.282459259033203, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8643257021903992, + "num_tokens": 420806367.0, + "step": 11026 + }, + { + "epoch": 1.4027477420175551, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.510730743408203, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8743131756782532, + "num_tokens": 420845081.0, + "step": 11027 + }, + { + "epoch": 1.4028749522961454, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44808006286621, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8719707727432251, + "num_tokens": 420885833.0, + "step": 11028 + }, + { + "epoch": 1.403002162574736, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.40652084350586, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.872771680355072, + "num_tokens": 420916313.0, + "step": 11029 + }, + { + "epoch": 1.4031293728533265, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45447540283203, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8499023914337158, + "num_tokens": 420957011.0, + "step": 11030 + }, + { + "epoch": 1.403256583131917, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.33173179626465, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8713564276695251, + "num_tokens": 420994815.0, + "step": 11031 + }, + { + "epoch": 1.4033837934105076, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45615577697754, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8749604225158691, + "num_tokens": 421033977.0, + "step": 11032 + }, + { + "epoch": 1.403511003689098, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.154563903808594, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8723834753036499, + "num_tokens": 421068871.0, + "step": 11033 + }, + { + "epoch": 1.4036382139676886, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.401256561279297, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8688877820968628, + "num_tokens": 421105680.0, + "step": 11034 + }, + { + "epoch": 1.4037654242462791, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.314876556396484, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8798201680183411, + "num_tokens": 421149519.0, + "step": 11035 + }, + { + "epoch": 1.4038926345248697, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45384979248047, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8689932823181152, + "num_tokens": 421190195.0, + "step": 11036 + }, + { + "epoch": 1.40401984480346, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.355260848999023, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8624681234359741, + "num_tokens": 421218637.0, + "step": 11037 + }, + { + "epoch": 1.4041470550820505, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.367679595947266, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8712314367294312, + "num_tokens": 421261300.0, + "step": 11038 + }, + { + "epoch": 1.404274265360641, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.509082794189453, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8633267283439636, + "num_tokens": 421297250.0, + "step": 11039 + }, + { + "epoch": 1.4044014756392316, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.157119750976562, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8609384298324585, + "num_tokens": 421334213.0, + "step": 11040 + }, + { + "epoch": 1.404528685917822, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.554462432861328, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8752142786979675, + "num_tokens": 421368341.0, + "step": 11041 + }, + { + "epoch": 1.4046558961964126, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.397077560424805, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8549149036407471, + "num_tokens": 421408278.0, + "step": 11042 + }, + { + "epoch": 1.4047831064750032, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.428466796875, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8596410751342773, + "num_tokens": 421442559.0, + "step": 11043 + }, + { + "epoch": 1.4049103167535937, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45113182067871, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8658171892166138, + "num_tokens": 421474014.0, + "step": 11044 + }, + { + "epoch": 1.4050375270321842, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34683609008789, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8435159921646118, + "num_tokens": 421511193.0, + "step": 11045 + }, + { + "epoch": 1.4051647373107747, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41994857788086, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8693271279335022, + "num_tokens": 421547895.0, + "step": 11046 + }, + { + "epoch": 1.4052919475893653, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.385080337524414, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8664917945861816, + "num_tokens": 421582883.0, + "step": 11047 + }, + { + "epoch": 1.4054191578679558, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.371044158935547, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.869487464427948, + "num_tokens": 421619838.0, + "step": 11048 + }, + { + "epoch": 1.4055463681465463, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.610628128051758, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8775816559791565, + "num_tokens": 421659451.0, + "step": 11049 + }, + { + "epoch": 1.4056735784251368, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.325963973999023, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8649361729621887, + "num_tokens": 421704826.0, + "step": 11050 + }, + { + "epoch": 1.4058007887037274, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.352561950683594, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8644154071807861, + "num_tokens": 421748953.0, + "step": 11051 + }, + { + "epoch": 1.405927998982318, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.536083221435547, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8513257503509521, + "num_tokens": 421786614.0, + "step": 11052 + }, + { + "epoch": 1.4060552092609082, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58502960205078, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8611094355583191, + "num_tokens": 421817291.0, + "step": 11053 + }, + { + "epoch": 1.4061824195394987, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.09734344482422, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.86549973487854, + "num_tokens": 421858200.0, + "step": 11054 + }, + { + "epoch": 1.4063096298180893, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.609634399414062, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8680335283279419, + "num_tokens": 421903343.0, + "step": 11055 + }, + { + "epoch": 1.4064368400966798, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.299528121948242, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8834489583969116, + "num_tokens": 421941426.0, + "step": 11056 + }, + { + "epoch": 1.4065640503752703, + "ewc_loss": 0.031494140625, + "ewc_loss_parallel": 3.147125244140625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28891372680664, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8644908666610718, + "num_tokens": 421983992.0, + "step": 11057 + }, + { + "epoch": 1.4066912606538609, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70441246032715, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8760338425636292, + "num_tokens": 422015066.0, + "step": 11058 + }, + { + "epoch": 1.4068184709324514, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.17517852783203, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8706120252609253, + "num_tokens": 422052392.0, + "step": 11059 + }, + { + "epoch": 1.406945681211042, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.303232192993164, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8635912537574768, + "num_tokens": 422091453.0, + "step": 11060 + }, + { + "epoch": 1.4070728914896324, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.332569122314453, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8525693416595459, + "num_tokens": 422131019.0, + "step": 11061 + }, + { + "epoch": 1.4072001017682227, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47256851196289, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8607265949249268, + "num_tokens": 422172139.0, + "step": 11062 + }, + { + "epoch": 1.4073273120468133, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.241893768310547, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8748867511749268, + "num_tokens": 422209708.0, + "step": 11063 + }, + { + "epoch": 1.4074545223254038, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.350963592529297, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8685768842697144, + "num_tokens": 422245091.0, + "step": 11064 + }, + { + "epoch": 1.4075817326039943, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.63117027282715, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8684446215629578, + "num_tokens": 422280054.0, + "step": 11065 + }, + { + "epoch": 1.4077089428825849, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5444278717041, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8747028708457947, + "num_tokens": 422310036.0, + "step": 11066 + }, + { + "epoch": 1.4078361531611754, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.489850997924805, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8520355224609375, + "num_tokens": 422347914.0, + "step": 11067 + }, + { + "epoch": 1.407963363439766, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.527677536010742, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8691434264183044, + "num_tokens": 422383207.0, + "step": 11068 + }, + { + "epoch": 1.4080905737183564, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.395681381225586, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8715634346008301, + "num_tokens": 422414675.0, + "step": 11069 + }, + { + "epoch": 1.408217783996947, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.324275970458984, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8557230234146118, + "num_tokens": 422449746.0, + "step": 11070 + }, + { + "epoch": 1.4083449942755375, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.453702926635742, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8620586395263672, + "num_tokens": 422487789.0, + "step": 11071 + }, + { + "epoch": 1.408472204554128, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.364334106445312, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8724443912506104, + "num_tokens": 422521750.0, + "step": 11072 + }, + { + "epoch": 1.4085994148327186, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.339115142822266, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8770867586135864, + "num_tokens": 422562923.0, + "step": 11073 + }, + { + "epoch": 1.408726625111309, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.345853805541992, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8566492199897766, + "num_tokens": 422606832.0, + "step": 11074 + }, + { + "epoch": 1.4088538353898996, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.365219116210938, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8704314231872559, + "num_tokens": 422644465.0, + "step": 11075 + }, + { + "epoch": 1.4089810456684901, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.21929931640625, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8764443397521973, + "num_tokens": 422681204.0, + "step": 11076 + }, + { + "epoch": 1.4091082559470804, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.462268829345703, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8573514223098755, + "num_tokens": 422716325.0, + "step": 11077 + }, + { + "epoch": 1.409235466225671, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.214014053344727, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8562255501747131, + "num_tokens": 422753644.0, + "step": 11078 + }, + { + "epoch": 1.4093626765042615, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.517576217651367, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8673420548439026, + "num_tokens": 422790893.0, + "step": 11079 + }, + { + "epoch": 1.409489886782852, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4248046875, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8597781658172607, + "num_tokens": 422829400.0, + "step": 11080 + }, + { + "epoch": 1.4096170970614426, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.248933792114258, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8769370317459106, + "num_tokens": 422867386.0, + "step": 11081 + }, + { + "epoch": 1.409744307340033, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.392972946166992, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8696808815002441, + "num_tokens": 422910736.0, + "step": 11082 + }, + { + "epoch": 1.4098715176186236, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.237716674804688, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8716593980789185, + "num_tokens": 422950302.0, + "step": 11083 + }, + { + "epoch": 1.4099987278972141, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.451255798339844, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8659041523933411, + "num_tokens": 422985892.0, + "step": 11084 + }, + { + "epoch": 1.4101259381758047, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.306306838989258, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8600852489471436, + "num_tokens": 423020075.0, + "step": 11085 + }, + { + "epoch": 1.410253148454395, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48481559753418, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8705873489379883, + "num_tokens": 423058260.0, + "step": 11086 + }, + { + "epoch": 1.4103803587329855, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.253944396972656, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.890743613243103, + "num_tokens": 423092158.0, + "step": 11087 + }, + { + "epoch": 1.410507569011576, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.37059211730957, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8649728298187256, + "num_tokens": 423123999.0, + "step": 11088 + }, + { + "epoch": 1.4106347792901666, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.512962341308594, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8636566400527954, + "num_tokens": 423167111.0, + "step": 11089 + }, + { + "epoch": 1.410761989568757, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.157258987426758, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.859965443611145, + "num_tokens": 423202060.0, + "step": 11090 + }, + { + "epoch": 1.4108891998473476, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.457090377807617, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8586447238922119, + "num_tokens": 423241200.0, + "step": 11091 + }, + { + "epoch": 1.4110164101259381, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.24540901184082, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8764747381210327, + "num_tokens": 423284713.0, + "step": 11092 + }, + { + "epoch": 1.4111436204045287, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.19270133972168, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8705328702926636, + "num_tokens": 423322524.0, + "step": 11093 + }, + { + "epoch": 1.4112708306831192, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.37481117248535, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8679264187812805, + "num_tokens": 423360551.0, + "step": 11094 + }, + { + "epoch": 1.4113980409617097, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.27769660949707, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8772885203361511, + "num_tokens": 423395227.0, + "step": 11095 + }, + { + "epoch": 1.4115252512403003, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.410751342773438, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.853989839553833, + "num_tokens": 423434757.0, + "step": 11096 + }, + { + "epoch": 1.4116524615188908, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.372087478637695, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.884692370891571, + "num_tokens": 423464599.0, + "step": 11097 + }, + { + "epoch": 1.4117796717974813, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28299331665039, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8605888485908508, + "num_tokens": 423504327.0, + "step": 11098 + }, + { + "epoch": 1.4119068820760718, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.362520217895508, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8490970134735107, + "num_tokens": 423543114.0, + "step": 11099 + }, + { + "epoch": 1.4120340923546624, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48752212524414, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8561752438545227, + "num_tokens": 423581727.0, + "step": 11100 + }, + { + "epoch": 1.412161302633253, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.460874557495117, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8712586760520935, + "num_tokens": 423617073.0, + "step": 11101 + }, + { + "epoch": 1.4122885129118432, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.249340057373047, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8715428113937378, + "num_tokens": 423664922.0, + "step": 11102 + }, + { + "epoch": 1.4124157231904337, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.315093994140625, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.858518123626709, + "num_tokens": 423706990.0, + "step": 11103 + }, + { + "epoch": 1.4125429334690243, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41672134399414, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8517618775367737, + "num_tokens": 423745945.0, + "step": 11104 + }, + { + "epoch": 1.4126701437476148, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.292814254760742, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.873039186000824, + "num_tokens": 423780824.0, + "step": 11105 + }, + { + "epoch": 1.4127973540262053, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.520503997802734, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8796037435531616, + "num_tokens": 423818640.0, + "step": 11106 + }, + { + "epoch": 1.4129245643047958, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.317758560180664, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8767504692077637, + "num_tokens": 423856657.0, + "step": 11107 + }, + { + "epoch": 1.4130517745833864, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.557044982910156, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8783880472183228, + "num_tokens": 423900288.0, + "step": 11108 + }, + { + "epoch": 1.413178984861977, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.466482162475586, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8737459182739258, + "num_tokens": 423935929.0, + "step": 11109 + }, + { + "epoch": 1.4133061951405674, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.464345932006836, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8664882779121399, + "num_tokens": 423970719.0, + "step": 11110 + }, + { + "epoch": 1.4134334054191577, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.536657333374023, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8688822388648987, + "num_tokens": 424012338.0, + "step": 11111 + }, + { + "epoch": 1.4135606156977483, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.367122650146484, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8503615260124207, + "num_tokens": 424050112.0, + "step": 11112 + }, + { + "epoch": 1.4136878259763388, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.829219818115234, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8676432371139526, + "num_tokens": 424092688.0, + "step": 11113 + }, + { + "epoch": 1.4138150362549293, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28091812133789, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8650559186935425, + "num_tokens": 424131355.0, + "step": 11114 + }, + { + "epoch": 1.4139422465335199, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.54342269897461, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.857234001159668, + "num_tokens": 424177768.0, + "step": 11115 + }, + { + "epoch": 1.4140694568121104, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.465604782104492, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8805319666862488, + "num_tokens": 424215834.0, + "step": 11116 + }, + { + "epoch": 1.414196667090701, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.320392608642578, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8717580437660217, + "num_tokens": 424260002.0, + "step": 11117 + }, + { + "epoch": 1.4143238773692914, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.67549705505371, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8659138679504395, + "num_tokens": 424299343.0, + "step": 11118 + }, + { + "epoch": 1.414451087647882, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.320568084716797, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8614581823348999, + "num_tokens": 424340312.0, + "step": 11119 + }, + { + "epoch": 1.4145782979264725, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50686264038086, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8762634992599487, + "num_tokens": 424376658.0, + "step": 11120 + }, + { + "epoch": 1.414705508205063, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.452665328979492, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8791071772575378, + "num_tokens": 424415499.0, + "step": 11121 + }, + { + "epoch": 1.4148327184836536, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.32000160217285, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8687602877616882, + "num_tokens": 424453401.0, + "step": 11122 + }, + { + "epoch": 1.414959928762244, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.40643882751465, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.871641993522644, + "num_tokens": 424490654.0, + "step": 11123 + }, + { + "epoch": 1.4150871390408346, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.39449119567871, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8777823448181152, + "num_tokens": 424534469.0, + "step": 11124 + }, + { + "epoch": 1.4152143493194251, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52141571044922, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8591558933258057, + "num_tokens": 424570187.0, + "step": 11125 + }, + { + "epoch": 1.4153415595980154, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.355426788330078, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8488661050796509, + "num_tokens": 424607898.0, + "step": 11126 + }, + { + "epoch": 1.415468769876606, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.574949264526367, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8714699745178223, + "num_tokens": 424652156.0, + "step": 11127 + }, + { + "epoch": 1.4155959801551965, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.05657196044922, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.858535647392273, + "num_tokens": 424688735.0, + "step": 11128 + }, + { + "epoch": 1.415723190433787, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.439241409301758, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8684879541397095, + "num_tokens": 424732859.0, + "step": 11129 + }, + { + "epoch": 1.4158504007123776, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.332958221435547, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8597143292427063, + "num_tokens": 424769381.0, + "step": 11130 + }, + { + "epoch": 1.415977610990968, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.432689666748047, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8685609698295593, + "num_tokens": 424802362.0, + "step": 11131 + }, + { + "epoch": 1.4161048212695586, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.413293838500977, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.864514946937561, + "num_tokens": 424837897.0, + "step": 11132 + }, + { + "epoch": 1.4162320315481491, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.433277130126953, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8797922730445862, + "num_tokens": 424878154.0, + "step": 11133 + }, + { + "epoch": 1.4163592418267397, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.448007583618164, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8738520741462708, + "num_tokens": 424915063.0, + "step": 11134 + }, + { + "epoch": 1.41648645210533, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.335573196411133, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.866653323173523, + "num_tokens": 424955159.0, + "step": 11135 + }, + { + "epoch": 1.4166136623839205, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.452714920043945, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8587551712989807, + "num_tokens": 424992573.0, + "step": 11136 + }, + { + "epoch": 1.416740872662511, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.281280517578125, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8726250529289246, + "num_tokens": 425030452.0, + "step": 11137 + }, + { + "epoch": 1.4168680829411016, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.399198532104492, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8809386491775513, + "num_tokens": 425067118.0, + "step": 11138 + }, + { + "epoch": 1.416995293219692, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44771957397461, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8584507703781128, + "num_tokens": 425101684.0, + "step": 11139 + }, + { + "epoch": 1.4171225034982826, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.309722900390625, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8759987354278564, + "num_tokens": 425143127.0, + "step": 11140 + }, + { + "epoch": 1.4172497137768731, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.230823516845703, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.867622971534729, + "num_tokens": 425173912.0, + "step": 11141 + }, + { + "epoch": 1.4173769240554637, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47382926940918, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8726770877838135, + "num_tokens": 425210146.0, + "step": 11142 + }, + { + "epoch": 1.4175041343340542, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.350793838500977, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8801773190498352, + "num_tokens": 425246082.0, + "step": 11143 + }, + { + "epoch": 1.4176313446126447, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.314754486083984, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8647651672363281, + "num_tokens": 425287470.0, + "step": 11144 + }, + { + "epoch": 1.4177585548912353, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.449007034301758, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8643038272857666, + "num_tokens": 425326553.0, + "step": 11145 + }, + { + "epoch": 1.4178857651698258, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31821632385254, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8608242869377136, + "num_tokens": 425363105.0, + "step": 11146 + }, + { + "epoch": 1.4180129754484163, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.424528121948242, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8625897169113159, + "num_tokens": 425402513.0, + "step": 11147 + }, + { + "epoch": 1.4181401857270068, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.37502670288086, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8687055110931396, + "num_tokens": 425440132.0, + "step": 11148 + }, + { + "epoch": 1.4182673960055974, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.335887908935547, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.866460919380188, + "num_tokens": 425481738.0, + "step": 11149 + }, + { + "epoch": 1.418394606284188, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34019660949707, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8664090633392334, + "num_tokens": 425516916.0, + "step": 11150 + }, + { + "epoch": 1.4185218165627782, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.438556671142578, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8678324818611145, + "num_tokens": 425555951.0, + "step": 11151 + }, + { + "epoch": 1.4186490268413687, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.111732482910156, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8679434061050415, + "num_tokens": 425595302.0, + "step": 11152 + }, + { + "epoch": 1.4187762371199593, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.419633865356445, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8721479177474976, + "num_tokens": 425623281.0, + "step": 11153 + }, + { + "epoch": 1.4189034473985498, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.53601837158203, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8764220476150513, + "num_tokens": 425659124.0, + "step": 11154 + }, + { + "epoch": 1.4190306576771403, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.18722152709961, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8560519814491272, + "num_tokens": 425695809.0, + "step": 11155 + }, + { + "epoch": 1.4191578679557308, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.32246208190918, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8588424921035767, + "num_tokens": 425731890.0, + "step": 11156 + }, + { + "epoch": 1.4192850782343214, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5191593170166, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8768752813339233, + "num_tokens": 425770899.0, + "step": 11157 + }, + { + "epoch": 1.419412288512912, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.366193771362305, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8786634206771851, + "num_tokens": 425807812.0, + "step": 11158 + }, + { + "epoch": 1.4195394987915024, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41609001159668, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8639956712722778, + "num_tokens": 425848997.0, + "step": 11159 + }, + { + "epoch": 1.4196667090700927, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.318933486938477, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8783961534500122, + "num_tokens": 425887816.0, + "step": 11160 + }, + { + "epoch": 1.4197939193486833, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.467594146728516, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8595026135444641, + "num_tokens": 425925488.0, + "step": 11161 + }, + { + "epoch": 1.4199211296272738, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.28543472290039, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8734884262084961, + "num_tokens": 425965616.0, + "step": 11162 + }, + { + "epoch": 1.4200483399058643, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.447681427001953, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8617788553237915, + "num_tokens": 426005308.0, + "step": 11163 + }, + { + "epoch": 1.4201755501844548, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.568696975708008, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8649083375930786, + "num_tokens": 426042339.0, + "step": 11164 + }, + { + "epoch": 1.4203027604630454, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.20838737487793, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8670952916145325, + "num_tokens": 426081925.0, + "step": 11165 + }, + { + "epoch": 1.420429970741636, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.447036743164062, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8627504706382751, + "num_tokens": 426120918.0, + "step": 11166 + }, + { + "epoch": 1.4205571810202264, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.432558059692383, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8557709455490112, + "num_tokens": 426158711.0, + "step": 11167 + }, + { + "epoch": 1.420684391298817, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.27914047241211, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8637226819992065, + "num_tokens": 426197034.0, + "step": 11168 + }, + { + "epoch": 1.4208116015774075, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.29316520690918, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8606860041618347, + "num_tokens": 426239390.0, + "step": 11169 + }, + { + "epoch": 1.420938811855998, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.466405868530273, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8576624989509583, + "num_tokens": 426285488.0, + "step": 11170 + }, + { + "epoch": 1.4210660221345885, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35833740234375, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.855056881904602, + "num_tokens": 426323955.0, + "step": 11171 + }, + { + "epoch": 1.421193232413179, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.321279525756836, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8859595656394958, + "num_tokens": 426359460.0, + "step": 11172 + }, + { + "epoch": 1.4213204426917696, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.289508819580078, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8602215051651001, + "num_tokens": 426397104.0, + "step": 11173 + }, + { + "epoch": 1.4214476529703601, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.296140670776367, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.876203179359436, + "num_tokens": 426441349.0, + "step": 11174 + }, + { + "epoch": 1.4215748632489504, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.414108276367188, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8755432963371277, + "num_tokens": 426478410.0, + "step": 11175 + }, + { + "epoch": 1.421702073527541, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.466217041015625, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.866401195526123, + "num_tokens": 426519874.0, + "step": 11176 + }, + { + "epoch": 1.4218292838061315, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25971031188965, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8553221225738525, + "num_tokens": 426559970.0, + "step": 11177 + }, + { + "epoch": 1.421956494084722, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.539794921875, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8689625263214111, + "num_tokens": 426593897.0, + "step": 11178 + }, + { + "epoch": 1.4220837043633126, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.474872589111328, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8679139614105225, + "num_tokens": 426633712.0, + "step": 11179 + }, + { + "epoch": 1.422210914641903, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.565256118774414, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8811314105987549, + "num_tokens": 426669085.0, + "step": 11180 + }, + { + "epoch": 1.4223381249204936, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.421091079711914, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8799695372581482, + "num_tokens": 426709260.0, + "step": 11181 + }, + { + "epoch": 1.4224653351990841, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.415435791015625, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.856602668762207, + "num_tokens": 426747747.0, + "step": 11182 + }, + { + "epoch": 1.4225925454776747, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.260223388671875, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8547128438949585, + "num_tokens": 426788185.0, + "step": 11183 + }, + { + "epoch": 1.422719755756265, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.506874084472656, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8694250583648682, + "num_tokens": 426828663.0, + "step": 11184 + }, + { + "epoch": 1.4228469660348555, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.220623016357422, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8575724959373474, + "num_tokens": 426869205.0, + "step": 11185 + }, + { + "epoch": 1.422974176313446, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.53898811340332, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8836793303489685, + "num_tokens": 426900561.0, + "step": 11186 + }, + { + "epoch": 1.4231013865920366, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31856918334961, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8819788694381714, + "num_tokens": 426940618.0, + "step": 11187 + }, + { + "epoch": 1.423228596870627, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.325536727905273, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8590610027313232, + "num_tokens": 426975509.0, + "step": 11188 + }, + { + "epoch": 1.4233558071492176, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.453941345214844, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8764746785163879, + "num_tokens": 427011647.0, + "step": 11189 + }, + { + "epoch": 1.4234830174278081, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.376392364501953, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8650320172309875, + "num_tokens": 427046631.0, + "step": 11190 + }, + { + "epoch": 1.4236102277063987, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.369054794311523, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.864898145198822, + "num_tokens": 427081364.0, + "step": 11191 + }, + { + "epoch": 1.4237374379849892, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.141643524169922, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8704853057861328, + "num_tokens": 427118667.0, + "step": 11192 + }, + { + "epoch": 1.4238646482635797, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.373863220214844, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8724116683006287, + "num_tokens": 427155017.0, + "step": 11193 + }, + { + "epoch": 1.4239918585421703, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23326873779297, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8606089353561401, + "num_tokens": 427191741.0, + "step": 11194 + }, + { + "epoch": 1.4241190688207608, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.535680770874023, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8716679215431213, + "num_tokens": 427226210.0, + "step": 11195 + }, + { + "epoch": 1.4242462790993513, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.191627502441406, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8634476661682129, + "num_tokens": 427261699.0, + "step": 11196 + }, + { + "epoch": 1.4243734893779418, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.333410263061523, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8575453758239746, + "num_tokens": 427297158.0, + "step": 11197 + }, + { + "epoch": 1.4245006996565324, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.345102310180664, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8534774780273438, + "num_tokens": 427338234.0, + "step": 11198 + }, + { + "epoch": 1.424627909935123, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31855583190918, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8622581958770752, + "num_tokens": 427376727.0, + "step": 11199 + }, + { + "epoch": 1.4247551202137132, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47038459777832, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8792345523834229, + "num_tokens": 427417095.0, + "step": 11200 + }, + { + "epoch": 1.4248823304923037, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.225797653198242, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8510490655899048, + "num_tokens": 427453243.0, + "step": 11201 + }, + { + "epoch": 1.4250095407708943, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50664710998535, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8703961372375488, + "num_tokens": 427490961.0, + "step": 11202 + }, + { + "epoch": 1.4251367510494848, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.345596313476562, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.877426266670227, + "num_tokens": 427521487.0, + "step": 11203 + }, + { + "epoch": 1.4252639613280753, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.475059509277344, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.864314079284668, + "num_tokens": 427556607.0, + "step": 11204 + }, + { + "epoch": 1.4253911716066658, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48011016845703, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.868583619594574, + "num_tokens": 427598940.0, + "step": 11205 + }, + { + "epoch": 1.4255183818852564, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.311307907104492, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8574047684669495, + "num_tokens": 427641337.0, + "step": 11206 + }, + { + "epoch": 1.425645592163847, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.233049392700195, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8781061172485352, + "num_tokens": 427680235.0, + "step": 11207 + }, + { + "epoch": 1.4257728024424374, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49840545654297, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8672906160354614, + "num_tokens": 427719918.0, + "step": 11208 + }, + { + "epoch": 1.4259000127210277, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3278751373291, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8702192306518555, + "num_tokens": 427758042.0, + "step": 11209 + }, + { + "epoch": 1.4260272229996183, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.392555236816406, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8702670335769653, + "num_tokens": 427794591.0, + "step": 11210 + }, + { + "epoch": 1.4261544332782088, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25904083251953, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8750984072685242, + "num_tokens": 427828190.0, + "step": 11211 + }, + { + "epoch": 1.4262816435567993, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.257282257080078, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8608042001724243, + "num_tokens": 427870363.0, + "step": 11212 + }, + { + "epoch": 1.4264088538353898, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46832275390625, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8825663924217224, + "num_tokens": 427910037.0, + "step": 11213 + }, + { + "epoch": 1.4265360641139804, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.39992904663086, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8664265275001526, + "num_tokens": 427945782.0, + "step": 11214 + }, + { + "epoch": 1.426663274392571, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36411476135254, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8755388855934143, + "num_tokens": 427984635.0, + "step": 11215 + }, + { + "epoch": 1.4267904846711614, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.238046646118164, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.883228063583374, + "num_tokens": 428024289.0, + "step": 11216 + }, + { + "epoch": 1.426917694949752, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.33768653869629, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8595013618469238, + "num_tokens": 428067119.0, + "step": 11217 + }, + { + "epoch": 1.4270449052283425, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.494903564453125, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8666519522666931, + "num_tokens": 428108150.0, + "step": 11218 + }, + { + "epoch": 1.427172115506933, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.363588333129883, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8782740235328674, + "num_tokens": 428150839.0, + "step": 11219 + }, + { + "epoch": 1.4272993257855235, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31712532043457, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8566426634788513, + "num_tokens": 428194325.0, + "step": 11220 + }, + { + "epoch": 1.427426536064114, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.342599868774414, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8677864074707031, + "num_tokens": 428230879.0, + "step": 11221 + }, + { + "epoch": 1.4275537463427046, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.391178131103516, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8787733912467957, + "num_tokens": 428265795.0, + "step": 11222 + }, + { + "epoch": 1.4276809566212951, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.461936950683594, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8740818500518799, + "num_tokens": 428300413.0, + "step": 11223 + }, + { + "epoch": 1.4278081668998854, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.416603088378906, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8882299661636353, + "num_tokens": 428339293.0, + "step": 11224 + }, + { + "epoch": 1.427935377178476, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.675392150878906, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8798844814300537, + "num_tokens": 428374158.0, + "step": 11225 + }, + { + "epoch": 1.4280625874570665, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.463483810424805, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8694359064102173, + "num_tokens": 428411715.0, + "step": 11226 + }, + { + "epoch": 1.428189797735657, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.408279418945312, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8568183779716492, + "num_tokens": 428450000.0, + "step": 11227 + }, + { + "epoch": 1.4283170080142475, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5675048828125, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8749088048934937, + "num_tokens": 428484546.0, + "step": 11228 + }, + { + "epoch": 1.428444218292838, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.332950592041016, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.869217574596405, + "num_tokens": 428527519.0, + "step": 11229 + }, + { + "epoch": 1.4285714285714286, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.485641479492188, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8638039827346802, + "num_tokens": 428568744.0, + "step": 11230 + }, + { + "epoch": 1.4286986388500191, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.538740158081055, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.86380934715271, + "num_tokens": 428613050.0, + "step": 11231 + }, + { + "epoch": 1.4288258491286097, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48217010498047, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8652054667472839, + "num_tokens": 428647334.0, + "step": 11232 + }, + { + "epoch": 1.4289530594072, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.378782272338867, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8746721744537354, + "num_tokens": 428681015.0, + "step": 11233 + }, + { + "epoch": 1.4290802696857905, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.766714096069336, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8621630668640137, + "num_tokens": 428718903.0, + "step": 11234 + }, + { + "epoch": 1.429207479964381, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.343534469604492, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.877403974533081, + "num_tokens": 428756038.0, + "step": 11235 + }, + { + "epoch": 1.4293346902429716, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.619312286376953, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8847568035125732, + "num_tokens": 428793908.0, + "step": 11236 + }, + { + "epoch": 1.429461900521562, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.454618453979492, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8611794710159302, + "num_tokens": 428830924.0, + "step": 11237 + }, + { + "epoch": 1.4295891108001526, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.269634246826172, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8659458756446838, + "num_tokens": 428869083.0, + "step": 11238 + }, + { + "epoch": 1.4297163210787431, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4216365814209, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8440684080123901, + "num_tokens": 428903970.0, + "step": 11239 + }, + { + "epoch": 1.4298435313573337, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49952507019043, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8728375434875488, + "num_tokens": 428940710.0, + "step": 11240 + }, + { + "epoch": 1.4299707416359242, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.362178802490234, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.870395839214325, + "num_tokens": 428981160.0, + "step": 11241 + }, + { + "epoch": 1.4300979519145147, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.249980926513672, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8696548342704773, + "num_tokens": 429022464.0, + "step": 11242 + }, + { + "epoch": 1.4302251621931052, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3045711517334, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8702991008758545, + "num_tokens": 429060964.0, + "step": 11243 + }, + { + "epoch": 1.4303523724716958, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.345462799072266, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8703491687774658, + "num_tokens": 429101389.0, + "step": 11244 + }, + { + "epoch": 1.4304795827502863, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60688591003418, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8651567697525024, + "num_tokens": 429135547.0, + "step": 11245 + }, + { + "epoch": 1.4306067930288768, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.184568405151367, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8715062141418457, + "num_tokens": 429177504.0, + "step": 11246 + }, + { + "epoch": 1.4307340033074674, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.525358200073242, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8735443949699402, + "num_tokens": 429217923.0, + "step": 11247 + }, + { + "epoch": 1.430861213586058, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.259933471679688, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8698834180831909, + "num_tokens": 429251514.0, + "step": 11248 + }, + { + "epoch": 1.4309884238646482, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46858787536621, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8530880808830261, + "num_tokens": 429291747.0, + "step": 11249 + }, + { + "epoch": 1.4311156341432387, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.201387405395508, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8648685216903687, + "num_tokens": 429333480.0, + "step": 11250 + }, + { + "epoch": 1.4312428444218293, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.365764617919922, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8799197673797607, + "num_tokens": 429366482.0, + "step": 11251 + }, + { + "epoch": 1.4313700547004198, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.377870559692383, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8606046438217163, + "num_tokens": 429400704.0, + "step": 11252 + }, + { + "epoch": 1.4314972649790103, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30402183532715, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.856191873550415, + "num_tokens": 429437463.0, + "step": 11253 + }, + { + "epoch": 1.4316244752576008, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45024299621582, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8577939867973328, + "num_tokens": 429474496.0, + "step": 11254 + }, + { + "epoch": 1.4317516855361914, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36843490600586, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.855080246925354, + "num_tokens": 429514641.0, + "step": 11255 + }, + { + "epoch": 1.431878895814782, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.255863189697266, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8681110143661499, + "num_tokens": 429554054.0, + "step": 11256 + }, + { + "epoch": 1.4320061060933724, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.526063919067383, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8639324903488159, + "num_tokens": 429594004.0, + "step": 11257 + }, + { + "epoch": 1.4321333163719627, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.272605895996094, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8750638365745544, + "num_tokens": 429633019.0, + "step": 11258 + }, + { + "epoch": 1.4322605266505533, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.38883399963379, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8692219257354736, + "num_tokens": 429666869.0, + "step": 11259 + }, + { + "epoch": 1.4323877369291438, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.455461502075195, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8775541186332703, + "num_tokens": 429700049.0, + "step": 11260 + }, + { + "epoch": 1.4325149472077343, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.34283447265625, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8662537336349487, + "num_tokens": 429737517.0, + "step": 11261 + }, + { + "epoch": 1.4326421574863248, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46746063232422, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8802788257598877, + "num_tokens": 429778697.0, + "step": 11262 + }, + { + "epoch": 1.4327693677649154, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.411680221557617, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8690837621688843, + "num_tokens": 429811883.0, + "step": 11263 + }, + { + "epoch": 1.432896578043506, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46829605102539, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8735265731811523, + "num_tokens": 429851748.0, + "step": 11264 + }, + { + "epoch": 1.4330237883220964, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35253143310547, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8589749932289124, + "num_tokens": 429891033.0, + "step": 11265 + }, + { + "epoch": 1.433150998600687, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.597026824951172, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8699705004692078, + "num_tokens": 429935980.0, + "step": 11266 + }, + { + "epoch": 1.4332782088792775, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.449338912963867, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8564563989639282, + "num_tokens": 429970874.0, + "step": 11267 + }, + { + "epoch": 1.433405419157868, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36113929748535, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8536745309829712, + "num_tokens": 430017909.0, + "step": 11268 + }, + { + "epoch": 1.4335326294364585, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45750617980957, + "learning_rate": 1e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8423313498497009, + "num_tokens": 430050641.0, + "step": 11269 + }, + { + "epoch": 1.433659839715049, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.26799201965332, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8719749450683594, + "num_tokens": 430082985.0, + "step": 11270 + }, + { + "epoch": 1.4337870499936396, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.56873893737793, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8697653412818909, + "num_tokens": 430122340.0, + "step": 11271 + }, + { + "epoch": 1.4339142602722301, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.569326400756836, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8535873889923096, + "num_tokens": 430162140.0, + "step": 11272 + }, + { + "epoch": 1.4340414705508204, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45391273498535, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8780680894851685, + "num_tokens": 430201672.0, + "step": 11273 + }, + { + "epoch": 1.434168680829411, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.714515686035156, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8579468727111816, + "num_tokens": 430239760.0, + "step": 11274 + }, + { + "epoch": 1.4342958911080015, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.419509887695312, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8508667945861816, + "num_tokens": 430280527.0, + "step": 11275 + }, + { + "epoch": 1.434423101386592, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.575305938720703, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8628596067428589, + "num_tokens": 430312788.0, + "step": 11276 + }, + { + "epoch": 1.4345503116651825, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.587589263916016, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8752166032791138, + "num_tokens": 430348044.0, + "step": 11277 + }, + { + "epoch": 1.434677521943773, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611879348754883, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8828179836273193, + "num_tokens": 430385676.0, + "step": 11278 + }, + { + "epoch": 1.4348047322223636, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.591466903686523, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8650541305541992, + "num_tokens": 430416976.0, + "step": 11279 + }, + { + "epoch": 1.4349319425009541, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.352272033691406, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8785334229469299, + "num_tokens": 430460002.0, + "step": 11280 + }, + { + "epoch": 1.4350591527795447, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.488496780395508, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8464910984039307, + "num_tokens": 430503172.0, + "step": 11281 + }, + { + "epoch": 1.435186363058135, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.415983200073242, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8551603555679321, + "num_tokens": 430541221.0, + "step": 11282 + }, + { + "epoch": 1.4353135733367255, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.422428131103516, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8603733777999878, + "num_tokens": 430579527.0, + "step": 11283 + }, + { + "epoch": 1.435440783615316, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.523841857910156, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8691426515579224, + "num_tokens": 430617263.0, + "step": 11284 + }, + { + "epoch": 1.4355679938939065, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.302202224731445, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8806284666061401, + "num_tokens": 430649961.0, + "step": 11285 + }, + { + "epoch": 1.435695204172497, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.557628631591797, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8627061247825623, + "num_tokens": 430682599.0, + "step": 11286 + }, + { + "epoch": 1.4358224144510876, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.323993682861328, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8398171663284302, + "num_tokens": 430721014.0, + "step": 11287 + }, + { + "epoch": 1.4359496247296781, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.412582397460938, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8664838075637817, + "num_tokens": 430757014.0, + "step": 11288 + }, + { + "epoch": 1.4360768350082687, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.501474380493164, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8682563900947571, + "num_tokens": 430796169.0, + "step": 11289 + }, + { + "epoch": 1.4362040452868592, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36836051940918, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8664430379867554, + "num_tokens": 430832726.0, + "step": 11290 + }, + { + "epoch": 1.4363312555654497, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.318679809570312, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8551805019378662, + "num_tokens": 430866343.0, + "step": 11291 + }, + { + "epoch": 1.4364584658440402, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.561145782470703, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.873337984085083, + "num_tokens": 430901622.0, + "step": 11292 + }, + { + "epoch": 1.4365856761226308, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.319923400878906, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8813634514808655, + "num_tokens": 430937499.0, + "step": 11293 + }, + { + "epoch": 1.4367128864012213, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.54197883605957, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8545933961868286, + "num_tokens": 430974048.0, + "step": 11294 + }, + { + "epoch": 1.4368400966798118, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.539281845092773, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8749169707298279, + "num_tokens": 431010927.0, + "step": 11295 + }, + { + "epoch": 1.4369673069584024, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49692153930664, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8524535894393921, + "num_tokens": 431051731.0, + "step": 11296 + }, + { + "epoch": 1.4370945172369929, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46517562866211, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8827611804008484, + "num_tokens": 431087876.0, + "step": 11297 + }, + { + "epoch": 1.4372217275155832, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.451723098754883, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8613808751106262, + "num_tokens": 431126217.0, + "step": 11298 + }, + { + "epoch": 1.4373489377941737, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.584413528442383, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8675621747970581, + "num_tokens": 431168049.0, + "step": 11299 + }, + { + "epoch": 1.4374761480727642, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.23763656616211, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8654088973999023, + "num_tokens": 431211143.0, + "step": 11300 + }, + { + "epoch": 1.4376033583513548, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.715024948120117, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8744586110115051, + "num_tokens": 431255747.0, + "step": 11301 + }, + { + "epoch": 1.4377305686299453, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.35508155822754, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8704613447189331, + "num_tokens": 431296618.0, + "step": 11302 + }, + { + "epoch": 1.4378577789085358, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.565969467163086, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.869060218334198, + "num_tokens": 431336493.0, + "step": 11303 + }, + { + "epoch": 1.4379849891871264, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.652057647705078, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8650078773498535, + "num_tokens": 431376200.0, + "step": 11304 + }, + { + "epoch": 1.438112199465717, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.40257453918457, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8472961783409119, + "num_tokens": 431414015.0, + "step": 11305 + }, + { + "epoch": 1.4382394097443074, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.63698387145996, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.86224365234375, + "num_tokens": 431453498.0, + "step": 11306 + }, + { + "epoch": 1.4383666200228977, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.43294334411621, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8660904169082642, + "num_tokens": 431491126.0, + "step": 11307 + }, + { + "epoch": 1.4384938303014883, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.316204071044922, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8893041610717773, + "num_tokens": 431528549.0, + "step": 11308 + }, + { + "epoch": 1.4386210405800788, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.32975196838379, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8731729984283447, + "num_tokens": 431569435.0, + "step": 11309 + }, + { + "epoch": 1.4387482508586693, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.208383560180664, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8723651766777039, + "num_tokens": 431606125.0, + "step": 11310 + }, + { + "epoch": 1.4388754611372598, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.596290588378906, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8624488711357117, + "num_tokens": 431643517.0, + "step": 11311 + }, + { + "epoch": 1.4390026714158504, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.399904251098633, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8768761157989502, + "num_tokens": 431682058.0, + "step": 11312 + }, + { + "epoch": 1.439129881694441, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.546781539916992, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8791971802711487, + "num_tokens": 431721241.0, + "step": 11313 + }, + { + "epoch": 1.4392570919730314, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.55274200439453, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8799933195114136, + "num_tokens": 431757809.0, + "step": 11314 + }, + { + "epoch": 1.439384302251622, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.302228927612305, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.856468915939331, + "num_tokens": 431795931.0, + "step": 11315 + }, + { + "epoch": 1.4395115125302125, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52518653869629, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8769413828849792, + "num_tokens": 431830517.0, + "step": 11316 + }, + { + "epoch": 1.439638722808803, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.364904403686523, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8695770502090454, + "num_tokens": 431866047.0, + "step": 11317 + }, + { + "epoch": 1.4397659330873935, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4864444732666, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.856408417224884, + "num_tokens": 431901602.0, + "step": 11318 + }, + { + "epoch": 1.439893143365984, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.363826751708984, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8628800511360168, + "num_tokens": 431934989.0, + "step": 11319 + }, + { + "epoch": 1.4400203536445746, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.372676849365234, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8715798258781433, + "num_tokens": 431968664.0, + "step": 11320 + }, + { + "epoch": 1.4401475639231651, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50821876525879, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8782357573509216, + "num_tokens": 432008025.0, + "step": 11321 + }, + { + "epoch": 1.4402747742017554, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.337417602539062, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8724426627159119, + "num_tokens": 432043526.0, + "step": 11322 + }, + { + "epoch": 1.440401984480346, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.528104782104492, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8683524131774902, + "num_tokens": 432085175.0, + "step": 11323 + }, + { + "epoch": 1.4405291947589365, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4049129486084, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8585186004638672, + "num_tokens": 432122379.0, + "step": 11324 + }, + { + "epoch": 1.440656405037527, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.30081558227539, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8700748085975647, + "num_tokens": 432157690.0, + "step": 11325 + }, + { + "epoch": 1.4407836153161175, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.418582916259766, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8770074844360352, + "num_tokens": 432188885.0, + "step": 11326 + }, + { + "epoch": 1.440910825594708, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.525272369384766, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.867344856262207, + "num_tokens": 432228804.0, + "step": 11327 + }, + { + "epoch": 1.4410380358732986, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.38318634033203, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8502062559127808, + "num_tokens": 432264150.0, + "step": 11328 + }, + { + "epoch": 1.4411652461518891, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.531042098999023, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8609486222267151, + "num_tokens": 432306759.0, + "step": 11329 + }, + { + "epoch": 1.4412924564304797, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.417808532714844, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8779463768005371, + "num_tokens": 432346928.0, + "step": 11330 + }, + { + "epoch": 1.44141966670907, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.436325073242188, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8748573064804077, + "num_tokens": 432386630.0, + "step": 11331 + }, + { + "epoch": 1.4415468769876605, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.583839416503906, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8556046485900879, + "num_tokens": 432427331.0, + "step": 11332 + }, + { + "epoch": 1.441674087266251, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41140365600586, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8590798377990723, + "num_tokens": 432462901.0, + "step": 11333 + }, + { + "epoch": 1.4418012975448415, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52009391784668, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8642149567604065, + "num_tokens": 432494232.0, + "step": 11334 + }, + { + "epoch": 1.441928507823432, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47529411315918, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8799601197242737, + "num_tokens": 432534276.0, + "step": 11335 + }, + { + "epoch": 1.4420557181020226, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.346858978271484, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8591305017471313, + "num_tokens": 432577337.0, + "step": 11336 + }, + { + "epoch": 1.4421829283806131, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5082950592041, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8589038252830505, + "num_tokens": 432614290.0, + "step": 11337 + }, + { + "epoch": 1.4423101386592037, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60920524597168, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.868379533290863, + "num_tokens": 432648558.0, + "step": 11338 + }, + { + "epoch": 1.4424373489377942, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.318065643310547, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.840859591960907, + "num_tokens": 432684243.0, + "step": 11339 + }, + { + "epoch": 1.4425645592163847, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31841468811035, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8658989667892456, + "num_tokens": 432719355.0, + "step": 11340 + }, + { + "epoch": 1.4426917694949752, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.429636001586914, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8668681979179382, + "num_tokens": 432758675.0, + "step": 11341 + }, + { + "epoch": 1.4428189797735658, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.545692443847656, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8646568059921265, + "num_tokens": 432801677.0, + "step": 11342 + }, + { + "epoch": 1.4429461900521563, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.26171875, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.865118145942688, + "num_tokens": 432836299.0, + "step": 11343 + }, + { + "epoch": 1.4430734003307468, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58415412902832, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.865356981754303, + "num_tokens": 432868838.0, + "step": 11344 + }, + { + "epoch": 1.4432006106093374, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44599151611328, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8689558506011963, + "num_tokens": 432904895.0, + "step": 11345 + }, + { + "epoch": 1.4433278208879279, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.352083206176758, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8613829612731934, + "num_tokens": 432944785.0, + "step": 11346 + }, + { + "epoch": 1.4434550311665182, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46134376525879, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8574929237365723, + "num_tokens": 432975645.0, + "step": 11347 + }, + { + "epoch": 1.4435822414451087, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.453227996826172, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8617653250694275, + "num_tokens": 433016887.0, + "step": 11348 + }, + { + "epoch": 1.4437094517236992, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3868465423584, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8690325617790222, + "num_tokens": 433053666.0, + "step": 11349 + }, + { + "epoch": 1.4438366620022898, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5090274810791, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.862335741519928, + "num_tokens": 433101131.0, + "step": 11350 + }, + { + "epoch": 1.4439638722808803, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.62765121459961, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8738741278648376, + "num_tokens": 433132535.0, + "step": 11351 + }, + { + "epoch": 1.4440910825594708, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.439743041992188, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8522696495056152, + "num_tokens": 433168229.0, + "step": 11352 + }, + { + "epoch": 1.4442182928380614, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.725074768066406, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8721562623977661, + "num_tokens": 433206094.0, + "step": 11353 + }, + { + "epoch": 1.4443455031166519, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.414594650268555, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8549840450286865, + "num_tokens": 433241038.0, + "step": 11354 + }, + { + "epoch": 1.4444727133952424, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.493181228637695, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8591164350509644, + "num_tokens": 433274674.0, + "step": 11355 + }, + { + "epoch": 1.4445999236738327, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.495914459228516, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8696937561035156, + "num_tokens": 433314511.0, + "step": 11356 + }, + { + "epoch": 1.4447271339524232, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.623289108276367, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8689945936203003, + "num_tokens": 433349338.0, + "step": 11357 + }, + { + "epoch": 1.4448543442310138, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.40654945373535, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.87709641456604, + "num_tokens": 433382609.0, + "step": 11358 + }, + { + "epoch": 1.4449815545096043, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.602842330932617, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8815646171569824, + "num_tokens": 433411553.0, + "step": 11359 + }, + { + "epoch": 1.4451087647881948, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.320556640625, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8645312786102295, + "num_tokens": 433450571.0, + "step": 11360 + }, + { + "epoch": 1.4452359750667854, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6435604095459, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8627364635467529, + "num_tokens": 433486994.0, + "step": 11361 + }, + { + "epoch": 1.445363185345376, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.310380935668945, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8795160055160522, + "num_tokens": 433521407.0, + "step": 11362 + }, + { + "epoch": 1.4454903956239664, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.529577255249023, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8778526186943054, + "num_tokens": 433561338.0, + "step": 11363 + }, + { + "epoch": 1.445617605902557, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.524534225463867, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8739784955978394, + "num_tokens": 433598892.0, + "step": 11364 + }, + { + "epoch": 1.4457448161811475, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.394412994384766, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8562756180763245, + "num_tokens": 433641423.0, + "step": 11365 + }, + { + "epoch": 1.445872026459738, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.344791412353516, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8703815937042236, + "num_tokens": 433677385.0, + "step": 11366 + }, + { + "epoch": 1.4459992367383285, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.43870735168457, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8655933737754822, + "num_tokens": 433716901.0, + "step": 11367 + }, + { + "epoch": 1.446126447016919, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.59413719177246, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8630321025848389, + "num_tokens": 433755099.0, + "step": 11368 + }, + { + "epoch": 1.4462536572955096, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.313634872436523, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.863257646560669, + "num_tokens": 433794012.0, + "step": 11369 + }, + { + "epoch": 1.4463808675741001, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.51162338256836, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8706322908401489, + "num_tokens": 433831965.0, + "step": 11370 + }, + { + "epoch": 1.4465080778526904, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.42916488647461, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8686482310295105, + "num_tokens": 433870819.0, + "step": 11371 + }, + { + "epoch": 1.446635288131281, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.387773513793945, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8624212741851807, + "num_tokens": 433909197.0, + "step": 11372 + }, + { + "epoch": 1.4467624984098715, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.548152923583984, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8737027645111084, + "num_tokens": 433946026.0, + "step": 11373 + }, + { + "epoch": 1.446889708688462, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.409433364868164, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8889175653457642, + "num_tokens": 433986239.0, + "step": 11374 + }, + { + "epoch": 1.4470169189670525, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.421140670776367, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8717851042747498, + "num_tokens": 434022557.0, + "step": 11375 + }, + { + "epoch": 1.447144129245643, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.781051635742188, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8710375428199768, + "num_tokens": 434059258.0, + "step": 11376 + }, + { + "epoch": 1.4472713395242336, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.42704200744629, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8538297414779663, + "num_tokens": 434094872.0, + "step": 11377 + }, + { + "epoch": 1.4473985498028241, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.54039764404297, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8455650806427002, + "num_tokens": 434134842.0, + "step": 11378 + }, + { + "epoch": 1.4475257600814146, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76649284362793, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8783290386199951, + "num_tokens": 434173152.0, + "step": 11379 + }, + { + "epoch": 1.447652970360005, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44398307800293, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8698134422302246, + "num_tokens": 434208318.0, + "step": 11380 + }, + { + "epoch": 1.4477801806385955, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.728330612182617, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8648717999458313, + "num_tokens": 434243693.0, + "step": 11381 + }, + { + "epoch": 1.447907390917186, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.430585861206055, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8557379841804504, + "num_tokens": 434283172.0, + "step": 11382 + }, + { + "epoch": 1.4480346011957765, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41712188720703, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8682732582092285, + "num_tokens": 434318789.0, + "step": 11383 + }, + { + "epoch": 1.448161811474367, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.59955406188965, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8595643639564514, + "num_tokens": 434358943.0, + "step": 11384 + }, + { + "epoch": 1.4482890217529576, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.54717445373535, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8675607442855835, + "num_tokens": 434398264.0, + "step": 11385 + }, + { + "epoch": 1.4484162320315481, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611814498901367, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8768736124038696, + "num_tokens": 434433602.0, + "step": 11386 + }, + { + "epoch": 1.4485434423101387, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.496135711669922, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8608013391494751, + "num_tokens": 434480001.0, + "step": 11387 + }, + { + "epoch": 1.4486706525887292, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.569156646728516, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.882651686668396, + "num_tokens": 434519285.0, + "step": 11388 + }, + { + "epoch": 1.4487978628673197, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.77553367614746, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8668971061706543, + "num_tokens": 434556118.0, + "step": 11389 + }, + { + "epoch": 1.4489250731459102, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.428186416625977, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8512572646141052, + "num_tokens": 434595350.0, + "step": 11390 + }, + { + "epoch": 1.4490522834245008, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48280906677246, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8648200035095215, + "num_tokens": 434632988.0, + "step": 11391 + }, + { + "epoch": 1.4491794937030913, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.709875106811523, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8745223879814148, + "num_tokens": 434673040.0, + "step": 11392 + }, + { + "epoch": 1.4493067039816818, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58940315246582, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8826358318328857, + "num_tokens": 434712201.0, + "step": 11393 + }, + { + "epoch": 1.4494339142602723, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.654695510864258, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8804367184638977, + "num_tokens": 434756676.0, + "step": 11394 + }, + { + "epoch": 1.4495611245388629, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.311420440673828, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8712551593780518, + "num_tokens": 434797890.0, + "step": 11395 + }, + { + "epoch": 1.4496883348174532, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.505895614624023, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8803174495697021, + "num_tokens": 434836933.0, + "step": 11396 + }, + { + "epoch": 1.4498155450960437, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52857208251953, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8810303807258606, + "num_tokens": 434868098.0, + "step": 11397 + }, + { + "epoch": 1.4499427553746342, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.77655601501465, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8635777831077576, + "num_tokens": 434907239.0, + "step": 11398 + }, + { + "epoch": 1.4500699656532248, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.503969192504883, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8737043142318726, + "num_tokens": 434945672.0, + "step": 11399 + }, + { + "epoch": 1.4501971759318153, + "ewc_loss": 0.03173828125, + "ewc_loss_parallel": 3.170967102050781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.408727645874023, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8574540615081787, + "num_tokens": 434981693.0, + "step": 11400 + }, + { + "epoch": 1.4503243862104058, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.518735885620117, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8899450302124023, + "num_tokens": 435012452.0, + "step": 11401 + }, + { + "epoch": 1.4504515964889964, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.810298919677734, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8610880374908447, + "num_tokens": 435052711.0, + "step": 11402 + }, + { + "epoch": 1.4505788067675869, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45673370361328, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8654171824455261, + "num_tokens": 435092766.0, + "step": 11403 + }, + { + "epoch": 1.4507060170461774, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.583232879638672, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8531886339187622, + "num_tokens": 435132093.0, + "step": 11404 + }, + { + "epoch": 1.4508332273247677, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.51820182800293, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8647217750549316, + "num_tokens": 435169071.0, + "step": 11405 + }, + { + "epoch": 1.4509604376033582, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.490983963012695, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.878243088722229, + "num_tokens": 435202665.0, + "step": 11406 + }, + { + "epoch": 1.4510876478819488, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.717082977294922, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8818819522857666, + "num_tokens": 435234719.0, + "step": 11407 + }, + { + "epoch": 1.4512148581605393, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.38226890563965, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8826370239257812, + "num_tokens": 435267090.0, + "step": 11408 + }, + { + "epoch": 1.4513420684391298, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.42320442199707, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8752850294113159, + "num_tokens": 435307243.0, + "step": 11409 + }, + { + "epoch": 1.4514692787177204, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.565061569213867, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.875188946723938, + "num_tokens": 435347507.0, + "step": 11410 + }, + { + "epoch": 1.4515964889963109, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.588903427124023, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8579840064048767, + "num_tokens": 435383499.0, + "step": 11411 + }, + { + "epoch": 1.4517236992749014, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.399723052978516, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8683618903160095, + "num_tokens": 435423500.0, + "step": 11412 + }, + { + "epoch": 1.451850909553492, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.721784591674805, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8721771836280823, + "num_tokens": 435457075.0, + "step": 11413 + }, + { + "epoch": 1.4519781198320825, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36681365966797, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8671071529388428, + "num_tokens": 435498933.0, + "step": 11414 + }, + { + "epoch": 1.452105330110673, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.800931930541992, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8596916198730469, + "num_tokens": 435535949.0, + "step": 11415 + }, + { + "epoch": 1.4522325403892635, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.485971450805664, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8615356087684631, + "num_tokens": 435565252.0, + "step": 11416 + }, + { + "epoch": 1.452359750667854, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36028480529785, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8491503000259399, + "num_tokens": 435604663.0, + "step": 11417 + }, + { + "epoch": 1.4524869609464446, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.416765213012695, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8630763292312622, + "num_tokens": 435640883.0, + "step": 11418 + }, + { + "epoch": 1.4526141712250351, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.544397354125977, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8706859350204468, + "num_tokens": 435682787.0, + "step": 11419 + }, + { + "epoch": 1.4527413815036254, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.753829956054688, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8655834794044495, + "num_tokens": 435719982.0, + "step": 11420 + }, + { + "epoch": 1.452868591782216, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.525415420532227, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8510732650756836, + "num_tokens": 435755153.0, + "step": 11421 + }, + { + "epoch": 1.4529958020608065, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.747072219848633, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8764696717262268, + "num_tokens": 435797513.0, + "step": 11422 + }, + { + "epoch": 1.453123012339397, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.65814208984375, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8668347597122192, + "num_tokens": 435836835.0, + "step": 11423 + }, + { + "epoch": 1.4532502226179875, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52021026611328, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.859167218208313, + "num_tokens": 435875124.0, + "step": 11424 + }, + { + "epoch": 1.453377432896578, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49551773071289, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8790260553359985, + "num_tokens": 435914993.0, + "step": 11425 + }, + { + "epoch": 1.4535046431751686, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.508054733276367, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8757496476173401, + "num_tokens": 435950916.0, + "step": 11426 + }, + { + "epoch": 1.4536318534537591, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.470630645751953, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8627485036849976, + "num_tokens": 435991986.0, + "step": 11427 + }, + { + "epoch": 1.4537590637323496, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.56725311279297, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8758776783943176, + "num_tokens": 436024603.0, + "step": 11428 + }, + { + "epoch": 1.45388627401094, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.541534423828125, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8625128269195557, + "num_tokens": 436065540.0, + "step": 11429 + }, + { + "epoch": 1.4540134842895305, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.686140060424805, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8542769551277161, + "num_tokens": 436104394.0, + "step": 11430 + }, + { + "epoch": 1.454140694568121, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.522546768188477, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8548035025596619, + "num_tokens": 436139977.0, + "step": 11431 + }, + { + "epoch": 1.4542679048467115, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.467805862426758, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8826273679733276, + "num_tokens": 436178047.0, + "step": 11432 + }, + { + "epoch": 1.454395115125302, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.538663864135742, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8658231496810913, + "num_tokens": 436218331.0, + "step": 11433 + }, + { + "epoch": 1.4545223254038926, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74593734741211, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.864497721195221, + "num_tokens": 436255663.0, + "step": 11434 + }, + { + "epoch": 1.4546495356824831, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.604612350463867, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8678794503211975, + "num_tokens": 436301328.0, + "step": 11435 + }, + { + "epoch": 1.4547767459610736, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.251005172729492, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8764636516571045, + "num_tokens": 436336487.0, + "step": 11436 + }, + { + "epoch": 1.4549039562396642, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60039710998535, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8700186014175415, + "num_tokens": 436377629.0, + "step": 11437 + }, + { + "epoch": 1.4550311665182547, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.401281356811523, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8814071416854858, + "num_tokens": 436421590.0, + "step": 11438 + }, + { + "epoch": 1.4551583767968452, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61486053466797, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8600993752479553, + "num_tokens": 436461172.0, + "step": 11439 + }, + { + "epoch": 1.4552855870754358, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.720800399780273, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8757688403129578, + "num_tokens": 436497536.0, + "step": 11440 + }, + { + "epoch": 1.4554127973540263, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.359590530395508, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.864589512348175, + "num_tokens": 436543906.0, + "step": 11441 + }, + { + "epoch": 1.4555400076326168, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.433334350585938, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8690321445465088, + "num_tokens": 436579462.0, + "step": 11442 + }, + { + "epoch": 1.4556672179112073, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.824800491333008, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.860500693321228, + "num_tokens": 436618840.0, + "step": 11443 + }, + { + "epoch": 1.4557944281897979, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50309181213379, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8565851449966431, + "num_tokens": 436654176.0, + "step": 11444 + }, + { + "epoch": 1.4559216384683882, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.786348342895508, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8796738386154175, + "num_tokens": 436685507.0, + "step": 11445 + }, + { + "epoch": 1.4560488487469787, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.422773361206055, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8757155537605286, + "num_tokens": 436729881.0, + "step": 11446 + }, + { + "epoch": 1.4561760590255692, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.734907150268555, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8662643432617188, + "num_tokens": 436772355.0, + "step": 11447 + }, + { + "epoch": 1.4563032693041598, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.38478660583496, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8608077764511108, + "num_tokens": 436808701.0, + "step": 11448 + }, + { + "epoch": 1.4564304795827503, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.646337509155273, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8770666122436523, + "num_tokens": 436845441.0, + "step": 11449 + }, + { + "epoch": 1.4565576898613408, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.696229934692383, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8703975677490234, + "num_tokens": 436885316.0, + "step": 11450 + }, + { + "epoch": 1.4566849001399313, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70868682861328, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.862148642539978, + "num_tokens": 436918315.0, + "step": 11451 + }, + { + "epoch": 1.4568121104185219, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.803138732910156, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8598666191101074, + "num_tokens": 436956039.0, + "step": 11452 + }, + { + "epoch": 1.4569393206971124, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.626358032226562, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.853515625, + "num_tokens": 436991274.0, + "step": 11453 + }, + { + "epoch": 1.4570665309757027, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.634357452392578, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.879558265209198, + "num_tokens": 437029963.0, + "step": 11454 + }, + { + "epoch": 1.4571937412542932, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.57224464416504, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8607428073883057, + "num_tokens": 437067030.0, + "step": 11455 + }, + { + "epoch": 1.4573209515328838, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.598251342773438, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8745719194412231, + "num_tokens": 437103363.0, + "step": 11456 + }, + { + "epoch": 1.4574481618114743, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.669200897216797, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8741103410720825, + "num_tokens": 437131574.0, + "step": 11457 + }, + { + "epoch": 1.4575753720900648, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.470561981201172, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8599806427955627, + "num_tokens": 437169537.0, + "step": 11458 + }, + { + "epoch": 1.4577025823686554, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.439565658569336, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.872582197189331, + "num_tokens": 437208025.0, + "step": 11459 + }, + { + "epoch": 1.4578297926472459, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.582834243774414, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8789758682250977, + "num_tokens": 437246642.0, + "step": 11460 + }, + { + "epoch": 1.4579570029258364, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.735126495361328, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8653534650802612, + "num_tokens": 437285274.0, + "step": 11461 + }, + { + "epoch": 1.458084213204427, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.426292419433594, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.884942889213562, + "num_tokens": 437325865.0, + "step": 11462 + }, + { + "epoch": 1.4582114234830175, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.603363037109375, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8731642365455627, + "num_tokens": 437364093.0, + "step": 11463 + }, + { + "epoch": 1.458338633761608, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.56324005126953, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8552526831626892, + "num_tokens": 437402413.0, + "step": 11464 + }, + { + "epoch": 1.4584658440401985, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.797531127929688, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8535027503967285, + "num_tokens": 437442583.0, + "step": 11465 + }, + { + "epoch": 1.458593054318789, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58819580078125, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8676971197128296, + "num_tokens": 437485682.0, + "step": 11466 + }, + { + "epoch": 1.4587202645973796, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.476789474487305, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.857453465461731, + "num_tokens": 437524859.0, + "step": 11467 + }, + { + "epoch": 1.45884747487597, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.510251998901367, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8593838810920715, + "num_tokens": 437563590.0, + "step": 11468 + }, + { + "epoch": 1.4589746851545604, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.545175552368164, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8837748765945435, + "num_tokens": 437601429.0, + "step": 11469 + }, + { + "epoch": 1.459101895433151, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.651348114013672, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8556386828422546, + "num_tokens": 437639144.0, + "step": 11470 + }, + { + "epoch": 1.4592291057117415, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44093894958496, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8750717043876648, + "num_tokens": 437680846.0, + "step": 11471 + }, + { + "epoch": 1.459356315990332, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.566802978515625, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8688083291053772, + "num_tokens": 437722156.0, + "step": 11472 + }, + { + "epoch": 1.4594835262689225, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.393352508544922, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.843845546245575, + "num_tokens": 437766905.0, + "step": 11473 + }, + { + "epoch": 1.459610736547513, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60572052001953, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8673443794250488, + "num_tokens": 437801425.0, + "step": 11474 + }, + { + "epoch": 1.4597379468261036, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.659517288208008, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8559044599533081, + "num_tokens": 437838126.0, + "step": 11475 + }, + { + "epoch": 1.4598651571046941, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.467044830322266, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8617749810218811, + "num_tokens": 437876597.0, + "step": 11476 + }, + { + "epoch": 1.4599923673832846, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.501033782958984, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8782368898391724, + "num_tokens": 437911852.0, + "step": 11477 + }, + { + "epoch": 1.460119577661875, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50661849975586, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8683682680130005, + "num_tokens": 437953418.0, + "step": 11478 + }, + { + "epoch": 1.4602467879404655, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.490493774414062, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8728100061416626, + "num_tokens": 437989561.0, + "step": 11479 + }, + { + "epoch": 1.460373998219056, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.757837295532227, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.875623345375061, + "num_tokens": 438033464.0, + "step": 11480 + }, + { + "epoch": 1.4605012084976465, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.390892028808594, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8711133599281311, + "num_tokens": 438068676.0, + "step": 11481 + }, + { + "epoch": 1.460628418776237, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.42334747314453, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8493578433990479, + "num_tokens": 438107867.0, + "step": 11482 + }, + { + "epoch": 1.4607556290548276, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.528724670410156, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8832162618637085, + "num_tokens": 438144256.0, + "step": 11483 + }, + { + "epoch": 1.4608828393334181, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.542919158935547, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8780558109283447, + "num_tokens": 438183458.0, + "step": 11484 + }, + { + "epoch": 1.4610100496120086, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.296951293945312, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8549291491508484, + "num_tokens": 438216025.0, + "step": 11485 + }, + { + "epoch": 1.4611372598905992, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61701774597168, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8788414597511292, + "num_tokens": 438258185.0, + "step": 11486 + }, + { + "epoch": 1.4612644701691897, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.244935989379883, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8641291856765747, + "num_tokens": 438298189.0, + "step": 11487 + }, + { + "epoch": 1.4613916804477802, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.691823959350586, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8561726212501526, + "num_tokens": 438336593.0, + "step": 11488 + }, + { + "epoch": 1.4615188907263708, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.646299362182617, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.87425696849823, + "num_tokens": 438374110.0, + "step": 11489 + }, + { + "epoch": 1.4616461010049613, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4108829498291, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8509482145309448, + "num_tokens": 438413062.0, + "step": 11490 + }, + { + "epoch": 1.4617733112835518, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.764965057373047, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8702850341796875, + "num_tokens": 438452055.0, + "step": 11491 + }, + { + "epoch": 1.4619005215621423, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.513193130493164, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8699417114257812, + "num_tokens": 438494224.0, + "step": 11492 + }, + { + "epoch": 1.4620277318407329, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.689863204956055, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8648689985275269, + "num_tokens": 438529790.0, + "step": 11493 + }, + { + "epoch": 1.4621549421193232, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.561792373657227, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8716480135917664, + "num_tokens": 438571371.0, + "step": 11494 + }, + { + "epoch": 1.4622821523979137, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.505123138427734, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8537157773971558, + "num_tokens": 438612383.0, + "step": 11495 + }, + { + "epoch": 1.4624093626765042, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.721921920776367, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8575903177261353, + "num_tokens": 438653443.0, + "step": 11496 + }, + { + "epoch": 1.4625365729550948, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.502689361572266, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8556139469146729, + "num_tokens": 438689565.0, + "step": 11497 + }, + { + "epoch": 1.4626637832336853, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.785659790039062, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8693153262138367, + "num_tokens": 438727753.0, + "step": 11498 + }, + { + "epoch": 1.4627909935122758, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.573577880859375, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.865212619304657, + "num_tokens": 438764667.0, + "step": 11499 + }, + { + "epoch": 1.4629182037908663, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.65353775024414, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8725037574768066, + "num_tokens": 438801739.0, + "step": 11500 + }, + { + "epoch": 1.4630454140694569, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.641647338867188, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8690101504325867, + "num_tokens": 438840028.0, + "step": 11501 + }, + { + "epoch": 1.4631726243480474, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.489582061767578, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8647585511207581, + "num_tokens": 438880632.0, + "step": 11502 + }, + { + "epoch": 1.4632998346266377, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48711395263672, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8648401498794556, + "num_tokens": 438923040.0, + "step": 11503 + }, + { + "epoch": 1.4634270449052282, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.505571365356445, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8681248426437378, + "num_tokens": 438958597.0, + "step": 11504 + }, + { + "epoch": 1.4635542551838188, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.567262649536133, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8561750650405884, + "num_tokens": 438997909.0, + "step": 11505 + }, + { + "epoch": 1.4636814654624093, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.71485710144043, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8755122423171997, + "num_tokens": 439030084.0, + "step": 11506 + }, + { + "epoch": 1.4638086757409998, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.409072875976562, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8701379299163818, + "num_tokens": 439067770.0, + "step": 11507 + }, + { + "epoch": 1.4639358860195903, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.62168312072754, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8704808950424194, + "num_tokens": 439102476.0, + "step": 11508 + }, + { + "epoch": 1.4640630962981809, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.501752853393555, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.877264678478241, + "num_tokens": 439138660.0, + "step": 11509 + }, + { + "epoch": 1.4641903065767714, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.57196807861328, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8753108978271484, + "num_tokens": 439181690.0, + "step": 11510 + }, + { + "epoch": 1.464317516855362, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.616619110107422, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8739223480224609, + "num_tokens": 439218825.0, + "step": 11511 + }, + { + "epoch": 1.4644447271339525, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.647537231445312, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8725526928901672, + "num_tokens": 439253505.0, + "step": 11512 + }, + { + "epoch": 1.464571937412543, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.400117874145508, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8617565631866455, + "num_tokens": 439292764.0, + "step": 11513 + }, + { + "epoch": 1.4646991476911335, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.888290405273438, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8766906261444092, + "num_tokens": 439335269.0, + "step": 11514 + }, + { + "epoch": 1.464826357969724, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.321178436279297, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.868910551071167, + "num_tokens": 439371977.0, + "step": 11515 + }, + { + "epoch": 1.4649535682483146, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.497882843017578, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8672308921813965, + "num_tokens": 439412710.0, + "step": 11516 + }, + { + "epoch": 1.465080778526905, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.73676300048828, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8670667409896851, + "num_tokens": 439449637.0, + "step": 11517 + }, + { + "epoch": 1.4652079888054954, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.590431213378906, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8627137541770935, + "num_tokens": 439495143.0, + "step": 11518 + }, + { + "epoch": 1.465335199084086, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.45543098449707, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8562185764312744, + "num_tokens": 439532721.0, + "step": 11519 + }, + { + "epoch": 1.4654624093626765, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.557497024536133, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8769667148590088, + "num_tokens": 439567938.0, + "step": 11520 + }, + { + "epoch": 1.465589619641267, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49199104309082, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8751269578933716, + "num_tokens": 439595720.0, + "step": 11521 + }, + { + "epoch": 1.4657168299198575, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.698707580566406, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8678176999092102, + "num_tokens": 439634091.0, + "step": 11522 + }, + { + "epoch": 1.465844040198448, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.395959854125977, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8566563129425049, + "num_tokens": 439671978.0, + "step": 11523 + }, + { + "epoch": 1.4659712504770386, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.401058197021484, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8638643622398376, + "num_tokens": 439707748.0, + "step": 11524 + }, + { + "epoch": 1.466098460755629, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.530479431152344, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8742400407791138, + "num_tokens": 439744060.0, + "step": 11525 + }, + { + "epoch": 1.4662256710342196, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.514266967773438, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8595647811889648, + "num_tokens": 439789277.0, + "step": 11526 + }, + { + "epoch": 1.46635288131281, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.478971481323242, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8846497535705566, + "num_tokens": 439827524.0, + "step": 11527 + }, + { + "epoch": 1.4664800915914005, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.713542938232422, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8650119304656982, + "num_tokens": 439865181.0, + "step": 11528 + }, + { + "epoch": 1.466607301869991, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.405977249145508, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8597472310066223, + "num_tokens": 439900071.0, + "step": 11529 + }, + { + "epoch": 1.4667345121485815, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.789358139038086, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8670533895492554, + "num_tokens": 439940839.0, + "step": 11530 + }, + { + "epoch": 1.466861722427172, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.355209350585938, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8648614883422852, + "num_tokens": 439978908.0, + "step": 11531 + }, + { + "epoch": 1.4669889327057626, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.16545867919922, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.866586446762085, + "num_tokens": 440013448.0, + "step": 11532 + }, + { + "epoch": 1.467116142984353, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.678049087524414, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.871427595615387, + "num_tokens": 440048377.0, + "step": 11533 + }, + { + "epoch": 1.4672433532629436, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.376304626464844, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8620913028717041, + "num_tokens": 440089806.0, + "step": 11534 + }, + { + "epoch": 1.4673705635415342, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.875141143798828, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8729003667831421, + "num_tokens": 440124209.0, + "step": 11535 + }, + { + "epoch": 1.4674977738201247, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4730281829834, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8688996434211731, + "num_tokens": 440158349.0, + "step": 11536 + }, + { + "epoch": 1.4676249840987152, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.54123306274414, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8695263862609863, + "num_tokens": 440195220.0, + "step": 11537 + }, + { + "epoch": 1.4677521943773058, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.906993865966797, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8674676418304443, + "num_tokens": 440235158.0, + "step": 11538 + }, + { + "epoch": 1.4678794046558963, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.672800064086914, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8632408380508423, + "num_tokens": 440271879.0, + "step": 11539 + }, + { + "epoch": 1.4680066149344868, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.55615234375, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.86631178855896, + "num_tokens": 440306454.0, + "step": 11540 + }, + { + "epoch": 1.4681338252130773, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.676984786987305, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8554791212081909, + "num_tokens": 440341185.0, + "step": 11541 + }, + { + "epoch": 1.4682610354916679, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70256996154785, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8667518496513367, + "num_tokens": 440379168.0, + "step": 11542 + }, + { + "epoch": 1.4683882457702582, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.532604217529297, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8698762655258179, + "num_tokens": 440417945.0, + "step": 11543 + }, + { + "epoch": 1.4685154560488487, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.36495590209961, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8524385690689087, + "num_tokens": 440457864.0, + "step": 11544 + }, + { + "epoch": 1.4686426663274392, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.804840087890625, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8629540205001831, + "num_tokens": 440491709.0, + "step": 11545 + }, + { + "epoch": 1.4687698766060298, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.656631469726562, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8743225336074829, + "num_tokens": 440533570.0, + "step": 11546 + }, + { + "epoch": 1.4688970868846203, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.454631805419922, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8727511763572693, + "num_tokens": 440568360.0, + "step": 11547 + }, + { + "epoch": 1.4690242971632108, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.512208938598633, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8773784637451172, + "num_tokens": 440606848.0, + "step": 11548 + }, + { + "epoch": 1.4691515074418013, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.139963150024414, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8806171417236328, + "num_tokens": 440638254.0, + "step": 11549 + }, + { + "epoch": 1.4692787177203919, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.2565975189209, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8734828233718872, + "num_tokens": 440674122.0, + "step": 11550 + }, + { + "epoch": 1.4694059279989824, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.581663131713867, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.882648229598999, + "num_tokens": 440718362.0, + "step": 11551 + }, + { + "epoch": 1.4695331382775727, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.533849716186523, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8740271329879761, + "num_tokens": 440755971.0, + "step": 11552 + }, + { + "epoch": 1.4696603485561632, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.471174240112305, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8660701513290405, + "num_tokens": 440794328.0, + "step": 11553 + }, + { + "epoch": 1.4697875588347538, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.605205535888672, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8552720546722412, + "num_tokens": 440833383.0, + "step": 11554 + }, + { + "epoch": 1.4699147691133443, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.59068489074707, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.874474048614502, + "num_tokens": 440875502.0, + "step": 11555 + }, + { + "epoch": 1.4700419793919348, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.404497146606445, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8707824349403381, + "num_tokens": 440915259.0, + "step": 11556 + }, + { + "epoch": 1.4701691896705253, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.709897994995117, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8652195930480957, + "num_tokens": 440951660.0, + "step": 11557 + }, + { + "epoch": 1.4702963999491159, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.577489852905273, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8679929375648499, + "num_tokens": 440992533.0, + "step": 11558 + }, + { + "epoch": 1.4704236102277064, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.57309341430664, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.856620192527771, + "num_tokens": 441027360.0, + "step": 11559 + }, + { + "epoch": 1.470550820506297, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.599939346313477, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8739372491836548, + "num_tokens": 441071746.0, + "step": 11560 + }, + { + "epoch": 1.4706780307848875, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.733158111572266, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.86702561378479, + "num_tokens": 441105378.0, + "step": 11561 + }, + { + "epoch": 1.470805241063478, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47085952758789, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8680223226547241, + "num_tokens": 441150000.0, + "step": 11562 + }, + { + "epoch": 1.4709324513420685, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.580413818359375, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.860511839389801, + "num_tokens": 441188426.0, + "step": 11563 + }, + { + "epoch": 1.471059661620659, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.568750381469727, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.874586284160614, + "num_tokens": 441226028.0, + "step": 11564 + }, + { + "epoch": 1.4711868718992496, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.738201141357422, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.868626594543457, + "num_tokens": 441265788.0, + "step": 11565 + }, + { + "epoch": 1.47131408217784, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.631349563598633, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8760551810264587, + "num_tokens": 441297280.0, + "step": 11566 + }, + { + "epoch": 1.4714412924564304, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.556049346923828, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8664031624794006, + "num_tokens": 441331229.0, + "step": 11567 + }, + { + "epoch": 1.471568502735021, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.460817337036133, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8711239695549011, + "num_tokens": 441371064.0, + "step": 11568 + }, + { + "epoch": 1.4716957130136115, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.56529426574707, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8494061231613159, + "num_tokens": 441405706.0, + "step": 11569 + }, + { + "epoch": 1.471822923292202, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.672204971313477, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8590054512023926, + "num_tokens": 441445966.0, + "step": 11570 + }, + { + "epoch": 1.4719501335707925, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.579896926879883, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.851331889629364, + "num_tokens": 441478916.0, + "step": 11571 + }, + { + "epoch": 1.472077343849383, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.607141494750977, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8647631406784058, + "num_tokens": 441519245.0, + "step": 11572 + }, + { + "epoch": 1.4722045541279736, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.473840713500977, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8598032593727112, + "num_tokens": 441557485.0, + "step": 11573 + }, + { + "epoch": 1.472331764406564, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.583717346191406, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8781307935714722, + "num_tokens": 441596042.0, + "step": 11574 + }, + { + "epoch": 1.4724589746851546, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.594314575195312, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8773060441017151, + "num_tokens": 441634391.0, + "step": 11575 + }, + { + "epoch": 1.472586184963745, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.384660720825195, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8592638969421387, + "num_tokens": 441672201.0, + "step": 11576 + }, + { + "epoch": 1.4727133952423355, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.618751525878906, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8651663661003113, + "num_tokens": 441713350.0, + "step": 11577 + }, + { + "epoch": 1.472840605520926, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47307014465332, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8670689463615417, + "num_tokens": 441754677.0, + "step": 11578 + }, + { + "epoch": 1.4729678157995165, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.641433715820312, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8571733832359314, + "num_tokens": 441797037.0, + "step": 11579 + }, + { + "epoch": 1.473095026078107, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.434934616088867, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8653954267501831, + "num_tokens": 441831661.0, + "step": 11580 + }, + { + "epoch": 1.4732222363566976, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.445728302001953, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8709961175918579, + "num_tokens": 441876933.0, + "step": 11581 + }, + { + "epoch": 1.473349446635288, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.38832664489746, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8894016742706299, + "num_tokens": 441918015.0, + "step": 11582 + }, + { + "epoch": 1.4734766569138786, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49903678894043, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8617619276046753, + "num_tokens": 441956573.0, + "step": 11583 + }, + { + "epoch": 1.4736038671924692, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.578319549560547, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8812621831893921, + "num_tokens": 442002064.0, + "step": 11584 + }, + { + "epoch": 1.4737310774710597, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.493450164794922, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8748120665550232, + "num_tokens": 442040122.0, + "step": 11585 + }, + { + "epoch": 1.4738582877496502, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.885578155517578, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8699793815612793, + "num_tokens": 442079698.0, + "step": 11586 + }, + { + "epoch": 1.4739854980282407, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.31475830078125, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8803610801696777, + "num_tokens": 442118252.0, + "step": 11587 + }, + { + "epoch": 1.4741127083068313, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.794593811035156, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8709522485733032, + "num_tokens": 442154675.0, + "step": 11588 + }, + { + "epoch": 1.4742399185854218, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.40334129333496, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8743099570274353, + "num_tokens": 442189449.0, + "step": 11589 + }, + { + "epoch": 1.4743671288640123, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.514907836914062, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8702783584594727, + "num_tokens": 442222844.0, + "step": 11590 + }, + { + "epoch": 1.4744943391426026, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.536727905273438, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8746917247772217, + "num_tokens": 442258599.0, + "step": 11591 + }, + { + "epoch": 1.4746215494211932, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5749454498291, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.875348687171936, + "num_tokens": 442294775.0, + "step": 11592 + }, + { + "epoch": 1.4747487596997837, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47331428527832, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8788695931434631, + "num_tokens": 442330313.0, + "step": 11593 + }, + { + "epoch": 1.4748759699783742, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.726112365722656, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8675094842910767, + "num_tokens": 442364988.0, + "step": 11594 + }, + { + "epoch": 1.4750031802569648, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.57046127319336, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8707528710365295, + "num_tokens": 442406927.0, + "step": 11595 + }, + { + "epoch": 1.4751303905355553, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.574438095092773, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8719290494918823, + "num_tokens": 442442092.0, + "step": 11596 + }, + { + "epoch": 1.4752576008141458, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.754737854003906, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8744685649871826, + "num_tokens": 442483013.0, + "step": 11597 + }, + { + "epoch": 1.4753848110927363, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72625732421875, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8744008541107178, + "num_tokens": 442525052.0, + "step": 11598 + }, + { + "epoch": 1.4755120213713269, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48273468017578, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8593435287475586, + "num_tokens": 442562543.0, + "step": 11599 + }, + { + "epoch": 1.4756392316499174, + "ewc_loss": 0.031982421875, + "ewc_loss_parallel": 3.1948089599609375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.407623291015625, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8738526105880737, + "num_tokens": 442604976.0, + "step": 11600 + }, + { + "epoch": 1.4757664419285077, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.652788162231445, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8796722888946533, + "num_tokens": 442642271.0, + "step": 11601 + }, + { + "epoch": 1.4758936522070982, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.476354598999023, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8655834794044495, + "num_tokens": 442680508.0, + "step": 11602 + }, + { + "epoch": 1.4760208624856888, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.555191040039062, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8564918637275696, + "num_tokens": 442718391.0, + "step": 11603 + }, + { + "epoch": 1.4761480727642793, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70606803894043, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8785486221313477, + "num_tokens": 442757688.0, + "step": 11604 + }, + { + "epoch": 1.4762752830428698, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.497331619262695, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8810387849807739, + "num_tokens": 442796704.0, + "step": 11605 + }, + { + "epoch": 1.4764024933214603, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.745824813842773, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8786981105804443, + "num_tokens": 442838791.0, + "step": 11606 + }, + { + "epoch": 1.4765297036000509, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.545724868774414, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8695037961006165, + "num_tokens": 442882368.0, + "step": 11607 + }, + { + "epoch": 1.4766569138786414, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58319854736328, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.868545413017273, + "num_tokens": 442923768.0, + "step": 11608 + }, + { + "epoch": 1.476784124157232, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.579744338989258, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.882218062877655, + "num_tokens": 442955656.0, + "step": 11609 + }, + { + "epoch": 1.4769113344358225, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.593990325927734, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8594432473182678, + "num_tokens": 442997323.0, + "step": 11610 + }, + { + "epoch": 1.477038544714413, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.508012771606445, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8546392917633057, + "num_tokens": 443033454.0, + "step": 11611 + }, + { + "epoch": 1.4771657549930035, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.662805557250977, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.868461012840271, + "num_tokens": 443077902.0, + "step": 11612 + }, + { + "epoch": 1.477292965271594, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.68324089050293, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8590952157974243, + "num_tokens": 443116515.0, + "step": 11613 + }, + { + "epoch": 1.4774201755501846, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72039794921875, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8941071033477783, + "num_tokens": 443154124.0, + "step": 11614 + }, + { + "epoch": 1.477547385828775, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46035385131836, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8771325349807739, + "num_tokens": 443195852.0, + "step": 11615 + }, + { + "epoch": 1.4776745961073654, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.837329864501953, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8620699048042297, + "num_tokens": 443229850.0, + "step": 11616 + }, + { + "epoch": 1.477801806385956, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.539798736572266, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8598974347114563, + "num_tokens": 443274481.0, + "step": 11617 + }, + { + "epoch": 1.4779290166645465, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.453943252563477, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8748941421508789, + "num_tokens": 443311737.0, + "step": 11618 + }, + { + "epoch": 1.478056226943137, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66020965576172, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8598361015319824, + "num_tokens": 443345310.0, + "step": 11619 + }, + { + "epoch": 1.4781834372217275, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.444835662841797, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8672617673873901, + "num_tokens": 443392869.0, + "step": 11620 + }, + { + "epoch": 1.478310647500318, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.614233016967773, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8675598502159119, + "num_tokens": 443432842.0, + "step": 11621 + }, + { + "epoch": 1.4784378577789086, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69793128967285, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8699925541877747, + "num_tokens": 443465954.0, + "step": 11622 + }, + { + "epoch": 1.478565068057499, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.647069931030273, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8595690727233887, + "num_tokens": 443508506.0, + "step": 11623 + }, + { + "epoch": 1.4786922783360896, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.25568389892578, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8677013516426086, + "num_tokens": 443543575.0, + "step": 11624 + }, + { + "epoch": 1.47881948861468, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.837757110595703, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8710716962814331, + "num_tokens": 443578504.0, + "step": 11625 + }, + { + "epoch": 1.4789466988932705, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49176597595215, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8815093636512756, + "num_tokens": 443614647.0, + "step": 11626 + }, + { + "epoch": 1.479073909171861, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70350456237793, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8634636402130127, + "num_tokens": 443649244.0, + "step": 11627 + }, + { + "epoch": 1.4792011194504515, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6719970703125, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8640377521514893, + "num_tokens": 443688374.0, + "step": 11628 + }, + { + "epoch": 1.479328329729042, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.564285278320312, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8382241725921631, + "num_tokens": 443730929.0, + "step": 11629 + }, + { + "epoch": 1.4794555400076326, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.633255004882812, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8635294437408447, + "num_tokens": 443763067.0, + "step": 11630 + }, + { + "epoch": 1.479582750286223, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50235366821289, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8567343354225159, + "num_tokens": 443801273.0, + "step": 11631 + }, + { + "epoch": 1.4797099605648136, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47916030883789, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8555073738098145, + "num_tokens": 443840394.0, + "step": 11632 + }, + { + "epoch": 1.4798371708434042, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.721981048583984, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.863754391670227, + "num_tokens": 443877486.0, + "step": 11633 + }, + { + "epoch": 1.4799643811219947, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.549009323120117, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8676187992095947, + "num_tokens": 443913763.0, + "step": 11634 + }, + { + "epoch": 1.4800915914005852, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.417081832885742, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8759739398956299, + "num_tokens": 443946840.0, + "step": 11635 + }, + { + "epoch": 1.4802188016791757, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.706682205200195, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8530822992324829, + "num_tokens": 443992439.0, + "step": 11636 + }, + { + "epoch": 1.4803460119577663, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.77726173400879, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8773411512374878, + "num_tokens": 444029348.0, + "step": 11637 + }, + { + "epoch": 1.4804732222363568, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611400604248047, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8754897117614746, + "num_tokens": 444068386.0, + "step": 11638 + }, + { + "epoch": 1.4806004325149473, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.488718032836914, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8568918704986572, + "num_tokens": 444110307.0, + "step": 11639 + }, + { + "epoch": 1.4807276427935376, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.601972579956055, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8680474758148193, + "num_tokens": 444153919.0, + "step": 11640 + }, + { + "epoch": 1.4808548530721282, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49081039428711, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8695747256278992, + "num_tokens": 444190575.0, + "step": 11641 + }, + { + "epoch": 1.4809820633507187, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.627782821655273, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8618036508560181, + "num_tokens": 444226108.0, + "step": 11642 + }, + { + "epoch": 1.4811092736293092, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5781192779541, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.864482045173645, + "num_tokens": 444268162.0, + "step": 11643 + }, + { + "epoch": 1.4812364839078997, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.665491104125977, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.853682279586792, + "num_tokens": 444309273.0, + "step": 11644 + }, + { + "epoch": 1.4813636941864903, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.542905807495117, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8728821873664856, + "num_tokens": 444338697.0, + "step": 11645 + }, + { + "epoch": 1.4814909044650808, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83926773071289, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8769575953483582, + "num_tokens": 444377954.0, + "step": 11646 + }, + { + "epoch": 1.4816181147436713, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60594940185547, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8681259751319885, + "num_tokens": 444416996.0, + "step": 11647 + }, + { + "epoch": 1.4817453250222619, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.65876579284668, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8740730881690979, + "num_tokens": 444455550.0, + "step": 11648 + }, + { + "epoch": 1.4818725353008524, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.653446197509766, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8642027974128723, + "num_tokens": 444493979.0, + "step": 11649 + }, + { + "epoch": 1.4819997455794427, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.508602142333984, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8665866851806641, + "num_tokens": 444540622.0, + "step": 11650 + }, + { + "epoch": 1.4821269558580332, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60492706298828, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8646724820137024, + "num_tokens": 444577772.0, + "step": 11651 + }, + { + "epoch": 1.4822541661366238, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.796043395996094, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8547036647796631, + "num_tokens": 444615523.0, + "step": 11652 + }, + { + "epoch": 1.4823813764152143, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.594467163085938, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8663442134857178, + "num_tokens": 444659785.0, + "step": 11653 + }, + { + "epoch": 1.4825085866938048, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.582117080688477, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8636821508407593, + "num_tokens": 444700806.0, + "step": 11654 + }, + { + "epoch": 1.4826357969723953, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.41445541381836, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8690701723098755, + "num_tokens": 444734320.0, + "step": 11655 + }, + { + "epoch": 1.4827630072509859, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.48815155029297, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8612060546875, + "num_tokens": 444770215.0, + "step": 11656 + }, + { + "epoch": 1.4828902175295764, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.590131759643555, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8664699792861938, + "num_tokens": 444805432.0, + "step": 11657 + }, + { + "epoch": 1.483017427808167, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69105339050293, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8669228553771973, + "num_tokens": 444832398.0, + "step": 11658 + }, + { + "epoch": 1.4831446380867574, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5505313873291, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8720913529396057, + "num_tokens": 444876128.0, + "step": 11659 + }, + { + "epoch": 1.483271848365348, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.49691390991211, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8621766567230225, + "num_tokens": 444915188.0, + "step": 11660 + }, + { + "epoch": 1.4833990586439385, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.461618423461914, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8731521368026733, + "num_tokens": 444953528.0, + "step": 11661 + }, + { + "epoch": 1.483526268922529, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.431278228759766, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.860766589641571, + "num_tokens": 444996469.0, + "step": 11662 + }, + { + "epoch": 1.4836534792011196, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.706741333007812, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.88504958152771, + "num_tokens": 445034298.0, + "step": 11663 + }, + { + "epoch": 1.48378068947971, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.375802993774414, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.866733968257904, + "num_tokens": 445079010.0, + "step": 11664 + }, + { + "epoch": 1.4839078997583004, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.549657821655273, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8813073635101318, + "num_tokens": 445116009.0, + "step": 11665 + }, + { + "epoch": 1.484035110036891, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.366926193237305, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8804638981819153, + "num_tokens": 445156313.0, + "step": 11666 + }, + { + "epoch": 1.4841623203154815, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.539806365966797, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8697363138198853, + "num_tokens": 445187424.0, + "step": 11667 + }, + { + "epoch": 1.484289530594072, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.695009231567383, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8687452673912048, + "num_tokens": 445233448.0, + "step": 11668 + }, + { + "epoch": 1.4844167408726625, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.492053985595703, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8782329559326172, + "num_tokens": 445273663.0, + "step": 11669 + }, + { + "epoch": 1.484543951151253, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.56218910217285, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8737345337867737, + "num_tokens": 445313790.0, + "step": 11670 + }, + { + "epoch": 1.4846711614298436, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66927146911621, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8780393600463867, + "num_tokens": 445351818.0, + "step": 11671 + }, + { + "epoch": 1.484798371708434, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70726776123047, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8426061272621155, + "num_tokens": 445391763.0, + "step": 11672 + }, + { + "epoch": 1.4849255819870246, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.519229888916016, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.862166166305542, + "num_tokens": 445426723.0, + "step": 11673 + }, + { + "epoch": 1.485052792265615, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.647701263427734, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8729579448699951, + "num_tokens": 445465635.0, + "step": 11674 + }, + { + "epoch": 1.4851800025442055, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.43855094909668, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.869320273399353, + "num_tokens": 445502819.0, + "step": 11675 + }, + { + "epoch": 1.485307212822796, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.519046783447266, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8773168325424194, + "num_tokens": 445541861.0, + "step": 11676 + }, + { + "epoch": 1.4854344231013865, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.605371475219727, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8829523324966431, + "num_tokens": 445578058.0, + "step": 11677 + }, + { + "epoch": 1.485561633379977, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47647476196289, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8530622124671936, + "num_tokens": 445612896.0, + "step": 11678 + }, + { + "epoch": 1.4856888436585676, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.47858428955078, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8620867133140564, + "num_tokens": 445647849.0, + "step": 11679 + }, + { + "epoch": 1.485816053937158, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89058494567871, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8533586263656616, + "num_tokens": 445689199.0, + "step": 11680 + }, + { + "epoch": 1.4859432642157486, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.647605895996094, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8661632537841797, + "num_tokens": 445727869.0, + "step": 11681 + }, + { + "epoch": 1.4860704744943392, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.763851165771484, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8715907335281372, + "num_tokens": 445770105.0, + "step": 11682 + }, + { + "epoch": 1.4861976847729297, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.485183715820312, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8469552993774414, + "num_tokens": 445812950.0, + "step": 11683 + }, + { + "epoch": 1.4863248950515202, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.670001983642578, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8655751943588257, + "num_tokens": 445850811.0, + "step": 11684 + }, + { + "epoch": 1.4864521053301107, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.775964736938477, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.871181845664978, + "num_tokens": 445891967.0, + "step": 11685 + }, + { + "epoch": 1.4865793156087013, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.713590621948242, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8692184090614319, + "num_tokens": 445927936.0, + "step": 11686 + }, + { + "epoch": 1.4867065258872918, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.581186294555664, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8797478675842285, + "num_tokens": 445962481.0, + "step": 11687 + }, + { + "epoch": 1.4868337361658823, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.53783416748047, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.881253719329834, + "num_tokens": 446001445.0, + "step": 11688 + }, + { + "epoch": 1.4869609464444726, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.542226791381836, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8858417272567749, + "num_tokens": 446037904.0, + "step": 11689 + }, + { + "epoch": 1.4870881567230632, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.56534194946289, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8767176866531372, + "num_tokens": 446077583.0, + "step": 11690 + }, + { + "epoch": 1.4872153670016537, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61175537109375, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8668820858001709, + "num_tokens": 446110123.0, + "step": 11691 + }, + { + "epoch": 1.4873425772802442, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.431663513183594, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8657520413398743, + "num_tokens": 446149832.0, + "step": 11692 + }, + { + "epoch": 1.4874697875588347, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.55953598022461, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8745338916778564, + "num_tokens": 446184211.0, + "step": 11693 + }, + { + "epoch": 1.4875969978374253, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.531652450561523, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8619780540466309, + "num_tokens": 446229620.0, + "step": 11694 + }, + { + "epoch": 1.4877242081160158, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.630508422851562, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8702177405357361, + "num_tokens": 446267134.0, + "step": 11695 + }, + { + "epoch": 1.4878514183946063, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.581806182861328, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8563161492347717, + "num_tokens": 446303001.0, + "step": 11696 + }, + { + "epoch": 1.4879786286731969, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.542606353759766, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8659716844558716, + "num_tokens": 446334286.0, + "step": 11697 + }, + { + "epoch": 1.4881058389517874, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5235538482666, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8854949474334717, + "num_tokens": 446369547.0, + "step": 11698 + }, + { + "epoch": 1.4882330492303777, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.604320526123047, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8530615568161011, + "num_tokens": 446408179.0, + "step": 11699 + }, + { + "epoch": 1.4883602595089682, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.742481231689453, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8703802227973938, + "num_tokens": 446448533.0, + "step": 11700 + }, + { + "epoch": 1.4884874697875587, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52103042602539, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8653066754341125, + "num_tokens": 446489445.0, + "step": 11701 + }, + { + "epoch": 1.4886146800661493, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.278045654296875, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8814953565597534, + "num_tokens": 446526530.0, + "step": 11702 + }, + { + "epoch": 1.4887418903447398, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81418228149414, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8692043423652649, + "num_tokens": 446558315.0, + "step": 11703 + }, + { + "epoch": 1.4888691006233303, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.491451263427734, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.870977520942688, + "num_tokens": 446598548.0, + "step": 11704 + }, + { + "epoch": 1.4889963109019209, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.40068244934082, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8639934062957764, + "num_tokens": 446632111.0, + "step": 11705 + }, + { + "epoch": 1.4891235211805114, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76641845703125, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.876420259475708, + "num_tokens": 446669296.0, + "step": 11706 + }, + { + "epoch": 1.489250731459102, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46237564086914, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8697981834411621, + "num_tokens": 446707891.0, + "step": 11707 + }, + { + "epoch": 1.4893779417376924, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.552227020263672, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8569895625114441, + "num_tokens": 446749455.0, + "step": 11708 + }, + { + "epoch": 1.489505152016283, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.554487228393555, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8797193765640259, + "num_tokens": 446787001.0, + "step": 11709 + }, + { + "epoch": 1.4896323622948735, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.554052352905273, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8686966896057129, + "num_tokens": 446827262.0, + "step": 11710 + }, + { + "epoch": 1.489759572573464, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.635133743286133, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8833606243133545, + "num_tokens": 446860514.0, + "step": 11711 + }, + { + "epoch": 1.4898867828520546, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.709436416625977, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.859391987323761, + "num_tokens": 446902605.0, + "step": 11712 + }, + { + "epoch": 1.490013993130645, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.423744201660156, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.874567985534668, + "num_tokens": 446938638.0, + "step": 11713 + }, + { + "epoch": 1.4901412034092354, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.748573303222656, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8738201856613159, + "num_tokens": 446979286.0, + "step": 11714 + }, + { + "epoch": 1.490268413687826, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60523223876953, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8529883027076721, + "num_tokens": 447017405.0, + "step": 11715 + }, + { + "epoch": 1.4903956239664164, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.42033576965332, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8709964752197266, + "num_tokens": 447056733.0, + "step": 11716 + }, + { + "epoch": 1.490522834245007, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.82143783569336, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8641072511672974, + "num_tokens": 447094493.0, + "step": 11717 + }, + { + "epoch": 1.4906500445235975, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46511459350586, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8839797377586365, + "num_tokens": 447134039.0, + "step": 11718 + }, + { + "epoch": 1.490777254802188, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.429553985595703, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8770972490310669, + "num_tokens": 447168320.0, + "step": 11719 + }, + { + "epoch": 1.4909044650807786, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611791610717773, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8520243167877197, + "num_tokens": 447205640.0, + "step": 11720 + }, + { + "epoch": 1.491031675359369, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74324607849121, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8819412589073181, + "num_tokens": 447239831.0, + "step": 11721 + }, + { + "epoch": 1.4911588856379596, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.394956588745117, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8536518216133118, + "num_tokens": 447281100.0, + "step": 11722 + }, + { + "epoch": 1.49128609591655, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83230972290039, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8493958711624146, + "num_tokens": 447316137.0, + "step": 11723 + }, + { + "epoch": 1.4914133061951405, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.467260360717773, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8619276285171509, + "num_tokens": 447357922.0, + "step": 11724 + }, + { + "epoch": 1.491540516473731, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.736507415771484, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.864284873008728, + "num_tokens": 447394905.0, + "step": 11725 + }, + { + "epoch": 1.4916677267523215, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5433292388916, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8789300322532654, + "num_tokens": 447435379.0, + "step": 11726 + }, + { + "epoch": 1.491794937030912, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.556154251098633, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8594731092453003, + "num_tokens": 447473058.0, + "step": 11727 + }, + { + "epoch": 1.4919221473095026, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.65091323852539, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8785276412963867, + "num_tokens": 447510966.0, + "step": 11728 + }, + { + "epoch": 1.492049357588093, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.55532455444336, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.871160626411438, + "num_tokens": 447554747.0, + "step": 11729 + }, + { + "epoch": 1.4921765678666836, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.44120979309082, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8706195950508118, + "num_tokens": 447594426.0, + "step": 11730 + }, + { + "epoch": 1.4923037781452742, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.665447235107422, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8611140251159668, + "num_tokens": 447634786.0, + "step": 11731 + }, + { + "epoch": 1.4924309884238647, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.594621658325195, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8652247786521912, + "num_tokens": 447668682.0, + "step": 11732 + }, + { + "epoch": 1.4925581987024552, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.739091873168945, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8607548475265503, + "num_tokens": 447705566.0, + "step": 11733 + }, + { + "epoch": 1.4926854089810457, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.747766494750977, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8652716875076294, + "num_tokens": 447746385.0, + "step": 11734 + }, + { + "epoch": 1.4928126192596363, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.4197998046875, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8723245859146118, + "num_tokens": 447790483.0, + "step": 11735 + }, + { + "epoch": 1.4929398295382268, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58592414855957, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8658394813537598, + "num_tokens": 447826872.0, + "step": 11736 + }, + { + "epoch": 1.4930670398168173, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58527946472168, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8727290630340576, + "num_tokens": 447864071.0, + "step": 11737 + }, + { + "epoch": 1.4931942500954076, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.73544692993164, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8518791198730469, + "num_tokens": 447903320.0, + "step": 11738 + }, + { + "epoch": 1.4933214603739982, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.491655349731445, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8825581073760986, + "num_tokens": 447940253.0, + "step": 11739 + }, + { + "epoch": 1.4934486706525887, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.887008666992188, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.873555600643158, + "num_tokens": 447978277.0, + "step": 11740 + }, + { + "epoch": 1.4935758809311792, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.71240997314453, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8574711680412292, + "num_tokens": 448018798.0, + "step": 11741 + }, + { + "epoch": 1.4937030912097697, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69107437133789, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8586216568946838, + "num_tokens": 448057469.0, + "step": 11742 + }, + { + "epoch": 1.4938303014883603, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.68507957458496, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8551909327507019, + "num_tokens": 448089032.0, + "step": 11743 + }, + { + "epoch": 1.4939575117669508, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.734546661376953, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8628047704696655, + "num_tokens": 448122438.0, + "step": 11744 + }, + { + "epoch": 1.4940847220455413, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.62860679626465, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8665256500244141, + "num_tokens": 448166844.0, + "step": 11745 + }, + { + "epoch": 1.4942119323241319, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.836523056030273, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8584305047988892, + "num_tokens": 448197766.0, + "step": 11746 + }, + { + "epoch": 1.4943391426027224, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.38714599609375, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8544715046882629, + "num_tokens": 448240850.0, + "step": 11747 + }, + { + "epoch": 1.4944663528813127, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.948585510253906, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.876684308052063, + "num_tokens": 448274187.0, + "step": 11748 + }, + { + "epoch": 1.4945935631599032, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.567049026489258, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8736099600791931, + "num_tokens": 448311069.0, + "step": 11749 + }, + { + "epoch": 1.4947207734384937, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.577213287353516, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8677563667297363, + "num_tokens": 448351536.0, + "step": 11750 + }, + { + "epoch": 1.4948479837170843, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7872314453125, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8655214309692383, + "num_tokens": 448385108.0, + "step": 11751 + }, + { + "epoch": 1.4949751939956748, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75309181213379, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8786104917526245, + "num_tokens": 448423272.0, + "step": 11752 + }, + { + "epoch": 1.4951024042742653, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.489944458007812, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8728209733963013, + "num_tokens": 448462904.0, + "step": 11753 + }, + { + "epoch": 1.4952296145528559, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.699750900268555, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8647692203521729, + "num_tokens": 448503257.0, + "step": 11754 + }, + { + "epoch": 1.4953568248314464, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.747804641723633, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8735771179199219, + "num_tokens": 448535299.0, + "step": 11755 + }, + { + "epoch": 1.495484035110037, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.701663970947266, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8743538856506348, + "num_tokens": 448568878.0, + "step": 11756 + }, + { + "epoch": 1.4956112453886274, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.795881271362305, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8859488368034363, + "num_tokens": 448604959.0, + "step": 11757 + }, + { + "epoch": 1.495738455667218, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.594743728637695, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8793120384216309, + "num_tokens": 448638487.0, + "step": 11758 + }, + { + "epoch": 1.4958656659458085, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.698143005371094, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8742769956588745, + "num_tokens": 448677212.0, + "step": 11759 + }, + { + "epoch": 1.495992876224399, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.58663558959961, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8623982667922974, + "num_tokens": 448719654.0, + "step": 11760 + }, + { + "epoch": 1.4961200865029896, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.648691177368164, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.87138831615448, + "num_tokens": 448760783.0, + "step": 11761 + }, + { + "epoch": 1.49624729678158, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70053482055664, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8576158881187439, + "num_tokens": 448802843.0, + "step": 11762 + }, + { + "epoch": 1.4963745070601704, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.381227493286133, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.864748477935791, + "num_tokens": 448838430.0, + "step": 11763 + }, + { + "epoch": 1.496501717338761, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72067642211914, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8589115142822266, + "num_tokens": 448880450.0, + "step": 11764 + }, + { + "epoch": 1.4966289276173514, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.665435791015625, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8704997301101685, + "num_tokens": 448919352.0, + "step": 11765 + }, + { + "epoch": 1.496756137895942, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.896940231323242, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8429384231567383, + "num_tokens": 448965402.0, + "step": 11766 + }, + { + "epoch": 1.4968833481745325, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.42784881591797, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8796500563621521, + "num_tokens": 448996983.0, + "step": 11767 + }, + { + "epoch": 1.497010558453123, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93492889404297, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8543280959129333, + "num_tokens": 449036555.0, + "step": 11768 + }, + { + "epoch": 1.4971377687317136, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.427448272705078, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8789135813713074, + "num_tokens": 449070545.0, + "step": 11769 + }, + { + "epoch": 1.497264979010304, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.722116470336914, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.852458119392395, + "num_tokens": 449108859.0, + "step": 11770 + }, + { + "epoch": 1.4973921892888946, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.921558380126953, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8647243976593018, + "num_tokens": 449145333.0, + "step": 11771 + }, + { + "epoch": 1.497519399567485, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.659500122070312, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.862588107585907, + "num_tokens": 449181292.0, + "step": 11772 + }, + { + "epoch": 1.4976466098460754, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.709312438964844, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8711484670639038, + "num_tokens": 449213940.0, + "step": 11773 + }, + { + "epoch": 1.497773820124666, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.73928451538086, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8788040280342102, + "num_tokens": 449252487.0, + "step": 11774 + }, + { + "epoch": 1.4979010304032565, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.861047744750977, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8611882925033569, + "num_tokens": 449287268.0, + "step": 11775 + }, + { + "epoch": 1.498028240681847, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.417932510375977, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.864799439907074, + "num_tokens": 449326975.0, + "step": 11776 + }, + { + "epoch": 1.4981554509604376, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78036880493164, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8672016859054565, + "num_tokens": 449366061.0, + "step": 11777 + }, + { + "epoch": 1.498282661239028, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.538089752197266, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.875824511051178, + "num_tokens": 449401763.0, + "step": 11778 + }, + { + "epoch": 1.4984098715176186, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.657594680786133, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8870495557785034, + "num_tokens": 449436605.0, + "step": 11779 + }, + { + "epoch": 1.4985370817962091, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70878791809082, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8630750179290771, + "num_tokens": 449474523.0, + "step": 11780 + }, + { + "epoch": 1.4986642920747997, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611352920532227, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8797388076782227, + "num_tokens": 449513301.0, + "step": 11781 + }, + { + "epoch": 1.4987915023533902, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.648515701293945, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8652138710021973, + "num_tokens": 449547826.0, + "step": 11782 + }, + { + "epoch": 1.4989187126319807, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52347183227539, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8672149181365967, + "num_tokens": 449584818.0, + "step": 11783 + }, + { + "epoch": 1.4990459229105713, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.421667098999023, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8494342565536499, + "num_tokens": 449625368.0, + "step": 11784 + }, + { + "epoch": 1.4991731331891618, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8668155670166, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8553301095962524, + "num_tokens": 449666320.0, + "step": 11785 + }, + { + "epoch": 1.4993003434677523, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.478239059448242, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8599252700805664, + "num_tokens": 449705993.0, + "step": 11786 + }, + { + "epoch": 1.4994275537463426, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61077308654785, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.863456130027771, + "num_tokens": 449747088.0, + "step": 11787 + }, + { + "epoch": 1.4995547640249332, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.633258819580078, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8702259659767151, + "num_tokens": 449789840.0, + "step": 11788 + }, + { + "epoch": 1.4996819743035237, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.524829864501953, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8673516511917114, + "num_tokens": 449830599.0, + "step": 11789 + }, + { + "epoch": 1.4998091845821142, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.854816436767578, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8725738525390625, + "num_tokens": 449864476.0, + "step": 11790 + }, + { + "epoch": 1.4999363948607047, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76980209350586, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.851988673210144, + "num_tokens": 449903791.0, + "step": 11791 + }, + { + "epoch": 1.5000636051392953, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61191177368164, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8677116632461548, + "num_tokens": 449945478.0, + "step": 11792 + }, + { + "epoch": 1.5001908154178858, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5311336517334, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8646843433380127, + "num_tokens": 449986175.0, + "step": 11793 + }, + { + "epoch": 1.5003180256964763, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74567413330078, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8652039170265198, + "num_tokens": 450025537.0, + "step": 11794 + }, + { + "epoch": 1.5004452359750666, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.772003173828125, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8540114164352417, + "num_tokens": 450068269.0, + "step": 11795 + }, + { + "epoch": 1.5005724462536572, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.602108001708984, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8830772638320923, + "num_tokens": 450106234.0, + "step": 11796 + }, + { + "epoch": 1.5006996565322477, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91641616821289, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8775040507316589, + "num_tokens": 450141916.0, + "step": 11797 + }, + { + "epoch": 1.5008268668108382, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69220733642578, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8537981510162354, + "num_tokens": 450175427.0, + "step": 11798 + }, + { + "epoch": 1.5009540770894287, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.549617767333984, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8700286746025085, + "num_tokens": 450211104.0, + "step": 11799 + }, + { + "epoch": 1.5010812873680193, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.80520248413086, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8594515323638916, + "num_tokens": 450244844.0, + "step": 11800 + }, + { + "epoch": 1.5012084976466098, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50794792175293, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8665984272956848, + "num_tokens": 450286094.0, + "step": 11801 + }, + { + "epoch": 1.5013357079252003, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.778900146484375, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8589128255844116, + "num_tokens": 450324938.0, + "step": 11802 + }, + { + "epoch": 1.5014629182037909, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99500846862793, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8680498003959656, + "num_tokens": 450368572.0, + "step": 11803 + }, + { + "epoch": 1.5015901284823814, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.518808364868164, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8471063375473022, + "num_tokens": 450410926.0, + "step": 11804 + }, + { + "epoch": 1.501717338760972, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.729549407958984, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8708579540252686, + "num_tokens": 450450206.0, + "step": 11805 + }, + { + "epoch": 1.5018445490395624, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.657957077026367, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8474926948547363, + "num_tokens": 450488998.0, + "step": 11806 + }, + { + "epoch": 1.501971759318153, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.868900299072266, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8779841661453247, + "num_tokens": 450530476.0, + "step": 11807 + }, + { + "epoch": 1.5020989695967435, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.575904846191406, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8744097352027893, + "num_tokens": 450560655.0, + "step": 11808 + }, + { + "epoch": 1.502226179875334, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.633825302124023, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8702764511108398, + "num_tokens": 450601816.0, + "step": 11809 + }, + { + "epoch": 1.5023533901539246, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.564451217651367, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8718044757843018, + "num_tokens": 450634182.0, + "step": 11810 + }, + { + "epoch": 1.502480600432515, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.765316009521484, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8622645139694214, + "num_tokens": 450672705.0, + "step": 11811 + }, + { + "epoch": 1.5026078107111056, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.761362075805664, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8551269173622131, + "num_tokens": 450705636.0, + "step": 11812 + }, + { + "epoch": 1.502735020989696, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95660400390625, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8584470748901367, + "num_tokens": 450736165.0, + "step": 11813 + }, + { + "epoch": 1.5028622312682864, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.67098617553711, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.869513213634491, + "num_tokens": 450776939.0, + "step": 11814 + }, + { + "epoch": 1.502989441546877, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.725793838500977, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8726913928985596, + "num_tokens": 450811994.0, + "step": 11815 + }, + { + "epoch": 1.5031166518254675, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.690332412719727, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8683976531028748, + "num_tokens": 450853688.0, + "step": 11816 + }, + { + "epoch": 1.503243862104058, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85153579711914, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8670271635055542, + "num_tokens": 450892100.0, + "step": 11817 + }, + { + "epoch": 1.5033710723826486, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.644065856933594, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8836114406585693, + "num_tokens": 450936383.0, + "step": 11818 + }, + { + "epoch": 1.503498282661239, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72202491760254, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8759015798568726, + "num_tokens": 450965525.0, + "step": 11819 + }, + { + "epoch": 1.5036254929398294, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.689788818359375, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8547906875610352, + "num_tokens": 451005874.0, + "step": 11820 + }, + { + "epoch": 1.50375270321842, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.732166290283203, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8484106063842773, + "num_tokens": 451040668.0, + "step": 11821 + }, + { + "epoch": 1.5038799134970104, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.814882278442383, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8650925755500793, + "num_tokens": 451082747.0, + "step": 11822 + }, + { + "epoch": 1.504007123775601, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.580453872680664, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8738526105880737, + "num_tokens": 451120344.0, + "step": 11823 + }, + { + "epoch": 1.5041343340541915, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.591291427612305, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8612360954284668, + "num_tokens": 451156564.0, + "step": 11824 + }, + { + "epoch": 1.504261544332782, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.678632736206055, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8631095886230469, + "num_tokens": 451197252.0, + "step": 11825 + }, + { + "epoch": 1.5043887546113726, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.748428344726562, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.856147289276123, + "num_tokens": 451236150.0, + "step": 11826 + }, + { + "epoch": 1.504515964889963, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.628982543945312, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.863793134689331, + "num_tokens": 451275751.0, + "step": 11827 + }, + { + "epoch": 1.5046431751685536, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.493728637695312, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8886367082595825, + "num_tokens": 451316778.0, + "step": 11828 + }, + { + "epoch": 1.5047703854471441, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.641984939575195, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8662819862365723, + "num_tokens": 451359921.0, + "step": 11829 + }, + { + "epoch": 1.5048975957257347, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.529687881469727, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8732780814170837, + "num_tokens": 451395616.0, + "step": 11830 + }, + { + "epoch": 1.5050248060043252, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.802785873413086, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8669053912162781, + "num_tokens": 451427766.0, + "step": 11831 + }, + { + "epoch": 1.5051520162829157, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.673357009887695, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8757114410400391, + "num_tokens": 451463073.0, + "step": 11832 + }, + { + "epoch": 1.5052792265615063, + "ewc_loss": 0.0322265625, + "ewc_loss_parallel": 3.218650817871094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.588687896728516, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.885648787021637, + "num_tokens": 451503039.0, + "step": 11833 + }, + { + "epoch": 1.5054064368400968, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.697893142700195, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8502565026283264, + "num_tokens": 451541599.0, + "step": 11834 + }, + { + "epoch": 1.5055336471186873, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.816322326660156, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8525230884552002, + "num_tokens": 451579278.0, + "step": 11835 + }, + { + "epoch": 1.5056608573972778, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.55349349975586, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8708454370498657, + "num_tokens": 451618591.0, + "step": 11836 + }, + { + "epoch": 1.5057880676758684, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.856794357299805, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8686178922653198, + "num_tokens": 451654453.0, + "step": 11837 + }, + { + "epoch": 1.5059152779544587, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5129337310791, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8653411865234375, + "num_tokens": 451688302.0, + "step": 11838 + }, + { + "epoch": 1.5060424882330492, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.50729751586914, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8723194599151611, + "num_tokens": 451727301.0, + "step": 11839 + }, + { + "epoch": 1.5061696985116397, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.59392547607422, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8525633215904236, + "num_tokens": 451770471.0, + "step": 11840 + }, + { + "epoch": 1.5062969087902303, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.633941650390625, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8557642698287964, + "num_tokens": 451809413.0, + "step": 11841 + }, + { + "epoch": 1.5064241190688208, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.572267532348633, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.864690363407135, + "num_tokens": 451850177.0, + "step": 11842 + }, + { + "epoch": 1.5065513293474113, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.614900588989258, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8553813695907593, + "num_tokens": 451887977.0, + "step": 11843 + }, + { + "epoch": 1.5066785396260016, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.495853424072266, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8782936334609985, + "num_tokens": 451927403.0, + "step": 11844 + }, + { + "epoch": 1.5068057499045922, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.683141708374023, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8669098615646362, + "num_tokens": 451967350.0, + "step": 11845 + }, + { + "epoch": 1.5069329601831827, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74551010131836, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8674845695495605, + "num_tokens": 452008664.0, + "step": 11846 + }, + { + "epoch": 1.5070601704617732, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.652143478393555, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8730238676071167, + "num_tokens": 452047303.0, + "step": 11847 + }, + { + "epoch": 1.5071873807403637, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.907888412475586, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8687812089920044, + "num_tokens": 452087359.0, + "step": 11848 + }, + { + "epoch": 1.5073145910189543, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.546709060668945, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8528872728347778, + "num_tokens": 452129426.0, + "step": 11849 + }, + { + "epoch": 1.5074418012975448, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.784914016723633, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8720622062683105, + "num_tokens": 452169134.0, + "step": 11850 + }, + { + "epoch": 1.5075690115761353, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.46428680419922, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8585727214813232, + "num_tokens": 452209962.0, + "step": 11851 + }, + { + "epoch": 1.5076962218547258, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60526466369629, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8703274130821228, + "num_tokens": 452245978.0, + "step": 11852 + }, + { + "epoch": 1.5078234321333164, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.670190811157227, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8769937753677368, + "num_tokens": 452281013.0, + "step": 11853 + }, + { + "epoch": 1.507950642411907, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.817596435546875, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8810771107673645, + "num_tokens": 452317174.0, + "step": 11854 + }, + { + "epoch": 1.5080778526904974, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.669113159179688, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8635461926460266, + "num_tokens": 452362974.0, + "step": 11855 + }, + { + "epoch": 1.508205062969088, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.542804718017578, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8786022663116455, + "num_tokens": 452400701.0, + "step": 11856 + }, + { + "epoch": 1.5083322732476785, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61178207397461, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8769099116325378, + "num_tokens": 452438541.0, + "step": 11857 + }, + { + "epoch": 1.508459483526269, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.727184295654297, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8615390062332153, + "num_tokens": 452476918.0, + "step": 11858 + }, + { + "epoch": 1.5085866938048595, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.770320892333984, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8709448575973511, + "num_tokens": 452509231.0, + "step": 11859 + }, + { + "epoch": 1.50871390408345, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6778507232666, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8616155982017517, + "num_tokens": 452547079.0, + "step": 11860 + }, + { + "epoch": 1.5088411143620406, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.583274841308594, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8789104223251343, + "num_tokens": 452580215.0, + "step": 11861 + }, + { + "epoch": 1.508968324640631, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.3987979888916, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8742398023605347, + "num_tokens": 452615778.0, + "step": 11862 + }, + { + "epoch": 1.5090955349192214, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.750598907470703, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8693360090255737, + "num_tokens": 452652184.0, + "step": 11863 + }, + { + "epoch": 1.509222745197812, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.633298873901367, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8531439900398254, + "num_tokens": 452691345.0, + "step": 11864 + }, + { + "epoch": 1.5093499554764025, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84872055053711, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8685033321380615, + "num_tokens": 452729982.0, + "step": 11865 + }, + { + "epoch": 1.509477165754993, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.714126586914062, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8659766316413879, + "num_tokens": 452771272.0, + "step": 11866 + }, + { + "epoch": 1.5096043760335836, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.548656463623047, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8637992143630981, + "num_tokens": 452812231.0, + "step": 11867 + }, + { + "epoch": 1.509731586312174, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.720211029052734, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8562835454940796, + "num_tokens": 452858406.0, + "step": 11868 + }, + { + "epoch": 1.5098587965907644, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.643404006958008, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8590093851089478, + "num_tokens": 452898793.0, + "step": 11869 + }, + { + "epoch": 1.509986006869355, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.715078353881836, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8730741739273071, + "num_tokens": 452937929.0, + "step": 11870 + }, + { + "epoch": 1.5101132171479454, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.621206283569336, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8705095052719116, + "num_tokens": 452975436.0, + "step": 11871 + }, + { + "epoch": 1.510240427426536, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.888938903808594, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8673033118247986, + "num_tokens": 453016334.0, + "step": 11872 + }, + { + "epoch": 1.5103676377051265, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.53523826599121, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8711158037185669, + "num_tokens": 453053630.0, + "step": 11873 + }, + { + "epoch": 1.510494847983717, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.818998336791992, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.849432110786438, + "num_tokens": 453092341.0, + "step": 11874 + }, + { + "epoch": 1.5106220582623076, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.750080108642578, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8752686977386475, + "num_tokens": 453126859.0, + "step": 11875 + }, + { + "epoch": 1.510749268540898, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66014289855957, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8641899824142456, + "num_tokens": 453167327.0, + "step": 11876 + }, + { + "epoch": 1.5108764788194886, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78139305114746, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.872681736946106, + "num_tokens": 453205901.0, + "step": 11877 + }, + { + "epoch": 1.5110036890980791, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.510330200195312, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.865631103515625, + "num_tokens": 453241856.0, + "step": 11878 + }, + { + "epoch": 1.5111308993766697, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.756832122802734, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8751219511032104, + "num_tokens": 453282528.0, + "step": 11879 + }, + { + "epoch": 1.5112581096552602, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.707670211791992, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8583735227584839, + "num_tokens": 453318762.0, + "step": 11880 + }, + { + "epoch": 1.5113853199338507, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.669239044189453, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8830716013908386, + "num_tokens": 453352482.0, + "step": 11881 + }, + { + "epoch": 1.5115125302124413, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.736377716064453, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8663533926010132, + "num_tokens": 453393692.0, + "step": 11882 + }, + { + "epoch": 1.5116397404910318, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.94093894958496, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8700330853462219, + "num_tokens": 453429217.0, + "step": 11883 + }, + { + "epoch": 1.5117669507696223, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.56622314453125, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8523286581039429, + "num_tokens": 453468190.0, + "step": 11884 + }, + { + "epoch": 1.5118941610482128, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.970245361328125, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8600929379463196, + "num_tokens": 453509893.0, + "step": 11885 + }, + { + "epoch": 1.5120213713268034, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6423282623291, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8703663349151611, + "num_tokens": 453546972.0, + "step": 11886 + }, + { + "epoch": 1.5121485816053937, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.672557830810547, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8789387345314026, + "num_tokens": 453578480.0, + "step": 11887 + }, + { + "epoch": 1.5122757918839842, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.834980010986328, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8675683736801147, + "num_tokens": 453623248.0, + "step": 11888 + }, + { + "epoch": 1.5124030021625747, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.691415786743164, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8872677087783813, + "num_tokens": 453660088.0, + "step": 11889 + }, + { + "epoch": 1.5125302124411653, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.59269142150879, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.869060754776001, + "num_tokens": 453698931.0, + "step": 11890 + }, + { + "epoch": 1.5126574227197558, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.877822875976562, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8694199323654175, + "num_tokens": 453734220.0, + "step": 11891 + }, + { + "epoch": 1.5127846329983463, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.701284408569336, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8658630847930908, + "num_tokens": 453774056.0, + "step": 11892 + }, + { + "epoch": 1.5129118432769366, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.64232063293457, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8740047812461853, + "num_tokens": 453806616.0, + "step": 11893 + }, + { + "epoch": 1.5130390535555271, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.732646942138672, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8762811422348022, + "num_tokens": 453843838.0, + "step": 11894 + }, + { + "epoch": 1.5131662638341177, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83392906188965, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.874732255935669, + "num_tokens": 453876888.0, + "step": 11895 + }, + { + "epoch": 1.5132934741127082, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.723712921142578, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8834090828895569, + "num_tokens": 453915334.0, + "step": 11896 + }, + { + "epoch": 1.5134206843912987, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.770902633666992, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8767788410186768, + "num_tokens": 453950590.0, + "step": 11897 + }, + { + "epoch": 1.5135478946698893, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.688268661499023, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8714638352394104, + "num_tokens": 453992590.0, + "step": 11898 + }, + { + "epoch": 1.5136751049484798, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.67851448059082, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8727186918258667, + "num_tokens": 454038619.0, + "step": 11899 + }, + { + "epoch": 1.5138023152270703, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.752635955810547, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8592478036880493, + "num_tokens": 454081564.0, + "step": 11900 + }, + { + "epoch": 1.5139295255056608, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.64303207397461, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.855926513671875, + "num_tokens": 454125581.0, + "step": 11901 + }, + { + "epoch": 1.5140567357842514, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.915742874145508, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8772028684616089, + "num_tokens": 454163961.0, + "step": 11902 + }, + { + "epoch": 1.514183946062842, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85468864440918, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8611609935760498, + "num_tokens": 454200410.0, + "step": 11903 + }, + { + "epoch": 1.5143111563414324, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.764053344726562, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8546020984649658, + "num_tokens": 454237143.0, + "step": 11904 + }, + { + "epoch": 1.514438366620023, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.718080520629883, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8758150935173035, + "num_tokens": 454274022.0, + "step": 11905 + }, + { + "epoch": 1.5145655768986135, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.710973739624023, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8629642724990845, + "num_tokens": 454307955.0, + "step": 11906 + }, + { + "epoch": 1.514692787177204, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6832218170166, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8750597834587097, + "num_tokens": 454345449.0, + "step": 11907 + }, + { + "epoch": 1.5148199974557945, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.557912826538086, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8564625978469849, + "num_tokens": 454383096.0, + "step": 11908 + }, + { + "epoch": 1.514947207734385, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.483259201049805, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.856805145740509, + "num_tokens": 454420142.0, + "step": 11909 + }, + { + "epoch": 1.5150744180129756, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.500106811523438, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8671336770057678, + "num_tokens": 454454991.0, + "step": 11910 + }, + { + "epoch": 1.515201628291566, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.898576736450195, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8637417554855347, + "num_tokens": 454490161.0, + "step": 11911 + }, + { + "epoch": 1.5153288385701564, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.690258026123047, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8591376543045044, + "num_tokens": 454534767.0, + "step": 11912 + }, + { + "epoch": 1.515456048848747, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76582145690918, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8764601945877075, + "num_tokens": 454573246.0, + "step": 11913 + }, + { + "epoch": 1.5155832591273375, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.793962478637695, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8807967305183411, + "num_tokens": 454607626.0, + "step": 11914 + }, + { + "epoch": 1.515710469405928, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.695539474487305, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8604376316070557, + "num_tokens": 454639840.0, + "step": 11915 + }, + { + "epoch": 1.5158376796845185, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.802249908447266, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8593215942382812, + "num_tokens": 454681445.0, + "step": 11916 + }, + { + "epoch": 1.515964889963109, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.698598861694336, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8611631393432617, + "num_tokens": 454726191.0, + "step": 11917 + }, + { + "epoch": 1.5160921002416994, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.540557861328125, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8747314214706421, + "num_tokens": 454765112.0, + "step": 11918 + }, + { + "epoch": 1.51621931052029, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85519790649414, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8652828335762024, + "num_tokens": 454805254.0, + "step": 11919 + }, + { + "epoch": 1.5163465207988804, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70931625366211, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8599830865859985, + "num_tokens": 454841344.0, + "step": 11920 + }, + { + "epoch": 1.516473731077471, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.565624237060547, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8808775544166565, + "num_tokens": 454879986.0, + "step": 11921 + }, + { + "epoch": 1.5166009413560615, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.685497283935547, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8590525984764099, + "num_tokens": 454917731.0, + "step": 11922 + }, + { + "epoch": 1.516728151634652, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.672962188720703, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8734608292579651, + "num_tokens": 454952837.0, + "step": 11923 + }, + { + "epoch": 1.5168553619132426, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.794031143188477, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8709184527397156, + "num_tokens": 454992751.0, + "step": 11924 + }, + { + "epoch": 1.516982572191833, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72894859313965, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8659218549728394, + "num_tokens": 455029829.0, + "step": 11925 + }, + { + "epoch": 1.5171097824704236, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.743675231933594, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8673913478851318, + "num_tokens": 455072167.0, + "step": 11926 + }, + { + "epoch": 1.5172369927490141, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.730607986450195, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8758304119110107, + "num_tokens": 455108433.0, + "step": 11927 + }, + { + "epoch": 1.5173642030276047, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.762998580932617, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8711394667625427, + "num_tokens": 455148943.0, + "step": 11928 + }, + { + "epoch": 1.5174914133061952, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.956130981445312, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8623810410499573, + "num_tokens": 455185099.0, + "step": 11929 + }, + { + "epoch": 1.5176186235847857, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69861602783203, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8819676637649536, + "num_tokens": 455225322.0, + "step": 11930 + }, + { + "epoch": 1.5177458338633762, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.784393310546875, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8654395341873169, + "num_tokens": 455262830.0, + "step": 11931 + }, + { + "epoch": 1.5178730441419668, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.90934944152832, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8747066259384155, + "num_tokens": 455300825.0, + "step": 11932 + }, + { + "epoch": 1.5180002544205573, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.59111785888672, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8641429543495178, + "num_tokens": 455336209.0, + "step": 11933 + }, + { + "epoch": 1.5181274646991478, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.924631118774414, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8675111532211304, + "num_tokens": 455374909.0, + "step": 11934 + }, + { + "epoch": 1.5182546749777384, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52517318725586, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8800034523010254, + "num_tokens": 455417179.0, + "step": 11935 + }, + { + "epoch": 1.5183818852563287, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.861202239990234, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8547841906547546, + "num_tokens": 455457923.0, + "step": 11936 + }, + { + "epoch": 1.5185090955349192, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.553285598754883, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8579150438308716, + "num_tokens": 455492932.0, + "step": 11937 + }, + { + "epoch": 1.5186363058135097, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.649444580078125, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8584159016609192, + "num_tokens": 455540392.0, + "step": 11938 + }, + { + "epoch": 1.5187635160921003, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.889352798461914, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.862873911857605, + "num_tokens": 455574461.0, + "step": 11939 + }, + { + "epoch": 1.5188907263706908, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.821758270263672, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8640652894973755, + "num_tokens": 455608315.0, + "step": 11940 + }, + { + "epoch": 1.5190179366492813, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.690458297729492, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8712201118469238, + "num_tokens": 455654927.0, + "step": 11941 + }, + { + "epoch": 1.5191451469278716, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.496400833129883, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8550281524658203, + "num_tokens": 455695745.0, + "step": 11942 + }, + { + "epoch": 1.5192723572064621, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.809053421020508, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8685630559921265, + "num_tokens": 455734652.0, + "step": 11943 + }, + { + "epoch": 1.5193995674850527, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72515106201172, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.845553457736969, + "num_tokens": 455776063.0, + "step": 11944 + }, + { + "epoch": 1.5195267777636432, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.714529037475586, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8620524406433105, + "num_tokens": 455820595.0, + "step": 11945 + }, + { + "epoch": 1.5196539880422337, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.793134689331055, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8734867572784424, + "num_tokens": 455859553.0, + "step": 11946 + }, + { + "epoch": 1.5197811983208243, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.630706787109375, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8685789108276367, + "num_tokens": 455892676.0, + "step": 11947 + }, + { + "epoch": 1.5199084085994148, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.660568237304688, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8594849109649658, + "num_tokens": 455932106.0, + "step": 11948 + }, + { + "epoch": 1.5200356188780053, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.57353973388672, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.872412919998169, + "num_tokens": 455971555.0, + "step": 11949 + }, + { + "epoch": 1.5201628291565958, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.734195709228516, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8619101047515869, + "num_tokens": 456014177.0, + "step": 11950 + }, + { + "epoch": 1.5202900394351864, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69841766357422, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8727632761001587, + "num_tokens": 456048229.0, + "step": 11951 + }, + { + "epoch": 1.520417249713777, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.546953201293945, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8864434957504272, + "num_tokens": 456085815.0, + "step": 11952 + }, + { + "epoch": 1.5205444599923674, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72156524658203, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.870004415512085, + "num_tokens": 456128971.0, + "step": 11953 + }, + { + "epoch": 1.520671670270958, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.626182556152344, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8498148322105408, + "num_tokens": 456159705.0, + "step": 11954 + }, + { + "epoch": 1.5207988805495485, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.722631454467773, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8583239912986755, + "num_tokens": 456198251.0, + "step": 11955 + }, + { + "epoch": 1.520926090828139, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.668033599853516, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8606455326080322, + "num_tokens": 456237612.0, + "step": 11956 + }, + { + "epoch": 1.5210533011067295, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.824909210205078, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8664989471435547, + "num_tokens": 456273665.0, + "step": 11957 + }, + { + "epoch": 1.52118051138532, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75711441040039, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8692128658294678, + "num_tokens": 456305467.0, + "step": 11958 + }, + { + "epoch": 1.5213077216639106, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.540109634399414, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8732500672340393, + "num_tokens": 456342951.0, + "step": 11959 + }, + { + "epoch": 1.521434931942501, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.715776443481445, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8676185607910156, + "num_tokens": 456375846.0, + "step": 11960 + }, + { + "epoch": 1.5215621422210914, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72929573059082, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8763151168823242, + "num_tokens": 456407993.0, + "step": 11961 + }, + { + "epoch": 1.521689352499682, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60659408569336, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8484087586402893, + "num_tokens": 456444278.0, + "step": 11962 + }, + { + "epoch": 1.5218165627782725, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.628482818603516, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8755139112472534, + "num_tokens": 456483950.0, + "step": 11963 + }, + { + "epoch": 1.521943773056863, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.602174758911133, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8681387305259705, + "num_tokens": 456527438.0, + "step": 11964 + }, + { + "epoch": 1.5220709833354535, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.637706756591797, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8783731460571289, + "num_tokens": 456560572.0, + "step": 11965 + }, + { + "epoch": 1.5221981936140438, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.570682525634766, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8585240840911865, + "num_tokens": 456604781.0, + "step": 11966 + }, + { + "epoch": 1.5223254038926344, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.811878204345703, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.880852460861206, + "num_tokens": 456639415.0, + "step": 11967 + }, + { + "epoch": 1.522452614171225, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.780223846435547, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.880344033241272, + "num_tokens": 456671183.0, + "step": 11968 + }, + { + "epoch": 1.5225798244498154, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6710262298584, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8555572032928467, + "num_tokens": 456712995.0, + "step": 11969 + }, + { + "epoch": 1.522707034728406, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.606164932250977, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8625811338424683, + "num_tokens": 456750587.0, + "step": 11970 + }, + { + "epoch": 1.5228342450069965, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.563629150390625, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8748840093612671, + "num_tokens": 456787503.0, + "step": 11971 + }, + { + "epoch": 1.522961455285587, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.848777770996094, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.844947099685669, + "num_tokens": 456831645.0, + "step": 11972 + }, + { + "epoch": 1.5230886655641775, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.650833129882812, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.87441086769104, + "num_tokens": 456869143.0, + "step": 11973 + }, + { + "epoch": 1.523215875842768, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.82098388671875, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8653863072395325, + "num_tokens": 456906465.0, + "step": 11974 + }, + { + "epoch": 1.5233430861213586, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.882558822631836, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8663762211799622, + "num_tokens": 456942048.0, + "step": 11975 + }, + { + "epoch": 1.5234702963999491, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.812938690185547, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8851867914199829, + "num_tokens": 456975710.0, + "step": 11976 + }, + { + "epoch": 1.5235975066785397, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.055356979370117, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8556557297706604, + "num_tokens": 457022295.0, + "step": 11977 + }, + { + "epoch": 1.5237247169571302, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.717348098754883, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.860619068145752, + "num_tokens": 457060433.0, + "step": 11978 + }, + { + "epoch": 1.5238519272357207, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.886890411376953, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.873249888420105, + "num_tokens": 457101760.0, + "step": 11979 + }, + { + "epoch": 1.5239791375143112, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.776403427124023, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8830210566520691, + "num_tokens": 457142676.0, + "step": 11980 + }, + { + "epoch": 1.5241063477929018, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.656116485595703, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8739364743232727, + "num_tokens": 457179507.0, + "step": 11981 + }, + { + "epoch": 1.5242335580714923, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.979703903198242, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8682836294174194, + "num_tokens": 457221563.0, + "step": 11982 + }, + { + "epoch": 1.5243607683500828, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66608428955078, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.865402340888977, + "num_tokens": 457258634.0, + "step": 11983 + }, + { + "epoch": 1.5244879786286734, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.745935440063477, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8813517689704895, + "num_tokens": 457293097.0, + "step": 11984 + }, + { + "epoch": 1.5246151889072637, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.902441024780273, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8821191787719727, + "num_tokens": 457333525.0, + "step": 11985 + }, + { + "epoch": 1.5247423991858542, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.82683563232422, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8535879254341125, + "num_tokens": 457379772.0, + "step": 11986 + }, + { + "epoch": 1.5248696094644447, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.73290252685547, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8728064298629761, + "num_tokens": 457415852.0, + "step": 11987 + }, + { + "epoch": 1.5249968197430352, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.934642791748047, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8725161552429199, + "num_tokens": 457449771.0, + "step": 11988 + }, + { + "epoch": 1.5251240300216258, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.799755096435547, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8629826307296753, + "num_tokens": 457489601.0, + "step": 11989 + }, + { + "epoch": 1.5252512403002163, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.850698471069336, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8816673755645752, + "num_tokens": 457530516.0, + "step": 11990 + }, + { + "epoch": 1.5253784505788066, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.522897720336914, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8648805618286133, + "num_tokens": 457568299.0, + "step": 11991 + }, + { + "epoch": 1.5255056608573971, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.939117431640625, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8669993281364441, + "num_tokens": 457600476.0, + "step": 11992 + }, + { + "epoch": 1.5256328711359877, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.743423461914062, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8880503177642822, + "num_tokens": 457632024.0, + "step": 11993 + }, + { + "epoch": 1.5257600814145782, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6179141998291, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8746654987335205, + "num_tokens": 457670805.0, + "step": 11994 + }, + { + "epoch": 1.5258872916931687, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93558120727539, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8694289922714233, + "num_tokens": 457702468.0, + "step": 11995 + }, + { + "epoch": 1.5260145019717593, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.639074325561523, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8776411414146423, + "num_tokens": 457740443.0, + "step": 11996 + }, + { + "epoch": 1.5261417122503498, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70596694946289, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.869165301322937, + "num_tokens": 457775833.0, + "step": 11997 + }, + { + "epoch": 1.5262689225289403, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.73758316040039, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8719862699508667, + "num_tokens": 457815850.0, + "step": 11998 + }, + { + "epoch": 1.5263961328075308, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.795740127563477, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8759303689002991, + "num_tokens": 457854946.0, + "step": 11999 + }, + { + "epoch": 1.5265233430861214, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.660978317260742, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8736770749092102, + "num_tokens": 457890775.0, + "step": 12000 + }, + { + "epoch": 1.526650553364712, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.840679168701172, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8739702701568604, + "num_tokens": 457928143.0, + "step": 12001 + }, + { + "epoch": 1.5267777636433024, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.733566284179688, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8744881749153137, + "num_tokens": 457965999.0, + "step": 12002 + }, + { + "epoch": 1.526904973921893, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.659896850585938, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8676886558532715, + "num_tokens": 458006638.0, + "step": 12003 + }, + { + "epoch": 1.5270321842004835, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.840782165527344, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8696094155311584, + "num_tokens": 458047522.0, + "step": 12004 + }, + { + "epoch": 1.527159394479074, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.597454071044922, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8690420985221863, + "num_tokens": 458091219.0, + "step": 12005 + }, + { + "epoch": 1.5272866047576645, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.771381378173828, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8565337061882019, + "num_tokens": 458129208.0, + "step": 12006 + }, + { + "epoch": 1.527413815036255, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81783103942871, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.864348292350769, + "num_tokens": 458163632.0, + "step": 12007 + }, + { + "epoch": 1.5275410253148456, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7573184967041, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8631328344345093, + "num_tokens": 458202377.0, + "step": 12008 + }, + { + "epoch": 1.527668235593436, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.662403106689453, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8685296773910522, + "num_tokens": 458239901.0, + "step": 12009 + }, + { + "epoch": 1.5277954458720264, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.63966941833496, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8631956577301025, + "num_tokens": 458280291.0, + "step": 12010 + }, + { + "epoch": 1.527922656150617, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.679357528686523, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8745994567871094, + "num_tokens": 458316931.0, + "step": 12011 + }, + { + "epoch": 1.5280498664292075, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.796817779541016, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8624062538146973, + "num_tokens": 458356477.0, + "step": 12012 + }, + { + "epoch": 1.528177076707798, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76618194580078, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8539973497390747, + "num_tokens": 458401340.0, + "step": 12013 + }, + { + "epoch": 1.5283042869863885, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6868896484375, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8746752142906189, + "num_tokens": 458436190.0, + "step": 12014 + }, + { + "epoch": 1.5284314972649788, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.734590530395508, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8668938875198364, + "num_tokens": 458465513.0, + "step": 12015 + }, + { + "epoch": 1.5285587075435694, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93434715270996, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8654864430427551, + "num_tokens": 458505465.0, + "step": 12016 + }, + { + "epoch": 1.52868591782216, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.52896499633789, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8656992316246033, + "num_tokens": 458545087.0, + "step": 12017 + }, + { + "epoch": 1.5288131281007504, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.835176467895508, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8767092227935791, + "num_tokens": 458583274.0, + "step": 12018 + }, + { + "epoch": 1.528940338379341, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.71944808959961, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8631561398506165, + "num_tokens": 458626026.0, + "step": 12019 + }, + { + "epoch": 1.5290675486579315, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.554332733154297, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8798328638076782, + "num_tokens": 458668774.0, + "step": 12020 + }, + { + "epoch": 1.529194758936522, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79146957397461, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8715485334396362, + "num_tokens": 458702256.0, + "step": 12021 + }, + { + "epoch": 1.5293219692151125, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.660411834716797, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8410488963127136, + "num_tokens": 458741429.0, + "step": 12022 + }, + { + "epoch": 1.529449179493703, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87788200378418, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8723607063293457, + "num_tokens": 458775608.0, + "step": 12023 + }, + { + "epoch": 1.5295763897722936, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.71813201904297, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8718839883804321, + "num_tokens": 458811451.0, + "step": 12024 + }, + { + "epoch": 1.5297036000508841, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70279884338379, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8720571398735046, + "num_tokens": 458851761.0, + "step": 12025 + }, + { + "epoch": 1.5298308103294747, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.02068519592285, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8614532947540283, + "num_tokens": 458888948.0, + "step": 12026 + }, + { + "epoch": 1.5299580206080652, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.505422592163086, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8666499853134155, + "num_tokens": 458922466.0, + "step": 12027 + }, + { + "epoch": 1.5300852308866557, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.820098876953125, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.863192081451416, + "num_tokens": 458956947.0, + "step": 12028 + }, + { + "epoch": 1.5302124411652462, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.851669311523438, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8858703970909119, + "num_tokens": 458993634.0, + "step": 12029 + }, + { + "epoch": 1.5303396514438368, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611326217651367, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8770266771316528, + "num_tokens": 459034314.0, + "step": 12030 + }, + { + "epoch": 1.5304668617224273, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97593116760254, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8741811513900757, + "num_tokens": 459072117.0, + "step": 12031 + }, + { + "epoch": 1.5305940720010178, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.629722595214844, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8638999462127686, + "num_tokens": 459112382.0, + "step": 12032 + }, + { + "epoch": 1.5307212822796084, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75783920288086, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8811942934989929, + "num_tokens": 459148973.0, + "step": 12033 + }, + { + "epoch": 1.5308484925581987, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.872211456298828, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8659072518348694, + "num_tokens": 459183242.0, + "step": 12034 + }, + { + "epoch": 1.5309757028367892, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70093536376953, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8858950138092041, + "num_tokens": 459219606.0, + "step": 12035 + }, + { + "epoch": 1.5311029131153797, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85843276977539, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.88702392578125, + "num_tokens": 459253763.0, + "step": 12036 + }, + { + "epoch": 1.5312301233939702, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.758821487426758, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8720571994781494, + "num_tokens": 459291187.0, + "step": 12037 + }, + { + "epoch": 1.5313573336725608, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.691911697387695, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8657408952713013, + "num_tokens": 459327423.0, + "step": 12038 + }, + { + "epoch": 1.5314845439511513, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.869441986083984, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8649044036865234, + "num_tokens": 459366207.0, + "step": 12039 + }, + { + "epoch": 1.5316117542297416, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.59444808959961, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8614989519119263, + "num_tokens": 459408338.0, + "step": 12040 + }, + { + "epoch": 1.5317389645083321, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.725860595703125, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8789260387420654, + "num_tokens": 459443420.0, + "step": 12041 + }, + { + "epoch": 1.5318661747869227, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.795631408691406, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8538841009140015, + "num_tokens": 459475835.0, + "step": 12042 + }, + { + "epoch": 1.5319933850655132, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.714357376098633, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8676847815513611, + "num_tokens": 459511993.0, + "step": 12043 + }, + { + "epoch": 1.5321205953441037, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.785877227783203, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8769940137863159, + "num_tokens": 459546221.0, + "step": 12044 + }, + { + "epoch": 1.5322478056226942, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.6636905670166, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8732110857963562, + "num_tokens": 459587271.0, + "step": 12045 + }, + { + "epoch": 1.5323750159012848, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.548486709594727, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8580076694488525, + "num_tokens": 459623388.0, + "step": 12046 + }, + { + "epoch": 1.5325022261798753, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.638303756713867, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8677504062652588, + "num_tokens": 459666440.0, + "step": 12047 + }, + { + "epoch": 1.5326294364584658, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.854700088500977, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8559078574180603, + "num_tokens": 459708373.0, + "step": 12048 + }, + { + "epoch": 1.5327566467370564, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87930679321289, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8790494203567505, + "num_tokens": 459746707.0, + "step": 12049 + }, + { + "epoch": 1.532883857015647, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.711673736572266, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.873548150062561, + "num_tokens": 459792493.0, + "step": 12050 + }, + { + "epoch": 1.5330110672942374, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75188446044922, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8780661821365356, + "num_tokens": 459828897.0, + "step": 12051 + }, + { + "epoch": 1.533138277572828, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.507356643676758, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8605293035507202, + "num_tokens": 459866274.0, + "step": 12052 + }, + { + "epoch": 1.5332654878514185, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.034151077270508, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8849235773086548, + "num_tokens": 459902442.0, + "step": 12053 + }, + { + "epoch": 1.533392698130009, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.546842575073242, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8773494958877563, + "num_tokens": 459943331.0, + "step": 12054 + }, + { + "epoch": 1.5335199084085995, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69112777709961, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8743946552276611, + "num_tokens": 459975349.0, + "step": 12055 + }, + { + "epoch": 1.53364711868719, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.814090728759766, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8818956017494202, + "num_tokens": 460008295.0, + "step": 12056 + }, + { + "epoch": 1.5337743289657806, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.835311889648438, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8738715052604675, + "num_tokens": 460045892.0, + "step": 12057 + }, + { + "epoch": 1.533901539244371, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.749523162841797, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8688005208969116, + "num_tokens": 460084255.0, + "step": 12058 + }, + { + "epoch": 1.5340287495229614, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.829038619995117, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8555087447166443, + "num_tokens": 460123706.0, + "step": 12059 + }, + { + "epoch": 1.534155959801552, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70424461364746, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8553208112716675, + "num_tokens": 460162351.0, + "step": 12060 + }, + { + "epoch": 1.5342831700801425, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78703498840332, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8596574068069458, + "num_tokens": 460200135.0, + "step": 12061 + }, + { + "epoch": 1.534410380358733, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.597951889038086, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8631998896598816, + "num_tokens": 460243585.0, + "step": 12062 + }, + { + "epoch": 1.5345375906373235, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.638179779052734, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8626749515533447, + "num_tokens": 460283695.0, + "step": 12063 + }, + { + "epoch": 1.5346648009159138, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.64601707458496, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8756670951843262, + "num_tokens": 460317241.0, + "step": 12064 + }, + { + "epoch": 1.5347920111945044, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.568016052246094, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8706431984901428, + "num_tokens": 460351900.0, + "step": 12065 + }, + { + "epoch": 1.534919221473095, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.828662872314453, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8706698417663574, + "num_tokens": 460391672.0, + "step": 12066 + }, + { + "epoch": 1.5350464317516854, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74205780029297, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8615264296531677, + "num_tokens": 460434120.0, + "step": 12067 + }, + { + "epoch": 1.535173642030276, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86116600036621, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8700814247131348, + "num_tokens": 460474214.0, + "step": 12068 + }, + { + "epoch": 1.5353008523088665, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.746959686279297, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8750109672546387, + "num_tokens": 460507549.0, + "step": 12069 + }, + { + "epoch": 1.535428062587457, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.65355682373047, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8613487482070923, + "num_tokens": 460546391.0, + "step": 12070 + }, + { + "epoch": 1.5355552728660475, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.846094131469727, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8576166033744812, + "num_tokens": 460585232.0, + "step": 12071 + }, + { + "epoch": 1.535682483144638, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96259117126465, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.876983642578125, + "num_tokens": 460624398.0, + "step": 12072 + }, + { + "epoch": 1.5358096934232286, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.742883682250977, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8620694875717163, + "num_tokens": 460667193.0, + "step": 12073 + }, + { + "epoch": 1.5359369037018191, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.715078353881836, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8452446460723877, + "num_tokens": 460710614.0, + "step": 12074 + }, + { + "epoch": 1.5360641139804097, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66361427307129, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8649348020553589, + "num_tokens": 460751079.0, + "step": 12075 + }, + { + "epoch": 1.5361913242590002, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.849699020385742, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8691909909248352, + "num_tokens": 460786768.0, + "step": 12076 + }, + { + "epoch": 1.5363185345375907, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.840845108032227, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8582009673118591, + "num_tokens": 460824757.0, + "step": 12077 + }, + { + "epoch": 1.5364457448161812, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78228759765625, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8623950481414795, + "num_tokens": 460865024.0, + "step": 12078 + }, + { + "epoch": 1.5365729550947718, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7421875, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8594237565994263, + "num_tokens": 460900895.0, + "step": 12079 + }, + { + "epoch": 1.5367001653733623, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.546472549438477, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8735167980194092, + "num_tokens": 460936876.0, + "step": 12080 + }, + { + "epoch": 1.5368273756519528, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.806934356689453, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.853715181350708, + "num_tokens": 460975297.0, + "step": 12081 + }, + { + "epoch": 1.5369545859305433, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.621784210205078, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8566044569015503, + "num_tokens": 461008603.0, + "step": 12082 + }, + { + "epoch": 1.5370817962091337, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.64862632751465, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8599576354026794, + "num_tokens": 461047770.0, + "step": 12083 + }, + { + "epoch": 1.5372090064877242, + "ewc_loss": 0.032470703125, + "ewc_loss_parallel": 3.24249267578125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.851991653442383, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8553517460823059, + "num_tokens": 461088554.0, + "step": 12084 + }, + { + "epoch": 1.5373362167663147, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.739395141601562, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8471589684486389, + "num_tokens": 461126070.0, + "step": 12085 + }, + { + "epoch": 1.5374634270449052, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.63106346130371, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8652372360229492, + "num_tokens": 461166394.0, + "step": 12086 + }, + { + "epoch": 1.5375906373234958, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.792701721191406, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8601836562156677, + "num_tokens": 461208263.0, + "step": 12087 + }, + { + "epoch": 1.5377178476020863, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.664905548095703, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8781283497810364, + "num_tokens": 461245054.0, + "step": 12088 + }, + { + "epoch": 1.5378450578806766, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70926856994629, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8574247360229492, + "num_tokens": 461281584.0, + "step": 12089 + }, + { + "epoch": 1.5379722681592671, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61780548095703, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8769207000732422, + "num_tokens": 461322247.0, + "step": 12090 + }, + { + "epoch": 1.5380994784378577, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.735280990600586, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8561451435089111, + "num_tokens": 461358371.0, + "step": 12091 + }, + { + "epoch": 1.5382266887164482, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7714786529541, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8746780157089233, + "num_tokens": 461394666.0, + "step": 12092 + }, + { + "epoch": 1.5383538989950387, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.646560668945312, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8800825476646423, + "num_tokens": 461430875.0, + "step": 12093 + }, + { + "epoch": 1.5384811092736292, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.681089401245117, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8638303279876709, + "num_tokens": 461467412.0, + "step": 12094 + }, + { + "epoch": 1.5386083195522198, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.873979568481445, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8465205430984497, + "num_tokens": 461501970.0, + "step": 12095 + }, + { + "epoch": 1.5387355298308103, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.727983474731445, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8552968502044678, + "num_tokens": 461542735.0, + "step": 12096 + }, + { + "epoch": 1.5388627401094008, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76934814453125, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8702133893966675, + "num_tokens": 461583883.0, + "step": 12097 + }, + { + "epoch": 1.5389899503879914, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74659538269043, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8592543005943298, + "num_tokens": 461625124.0, + "step": 12098 + }, + { + "epoch": 1.5391171606665819, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.686180114746094, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8568548560142517, + "num_tokens": 461660273.0, + "step": 12099 + }, + { + "epoch": 1.5392443709451724, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72281837463379, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8614087104797363, + "num_tokens": 461698890.0, + "step": 12100 + }, + { + "epoch": 1.539371581223763, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.812925338745117, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8676544427871704, + "num_tokens": 461736927.0, + "step": 12101 + }, + { + "epoch": 1.5394987915023535, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.892620086669922, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8654232621192932, + "num_tokens": 461777796.0, + "step": 12102 + }, + { + "epoch": 1.539626001780944, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91554069519043, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8767071962356567, + "num_tokens": 461814656.0, + "step": 12103 + }, + { + "epoch": 1.5397532120595345, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.827272415161133, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8496381044387817, + "num_tokens": 461852186.0, + "step": 12104 + }, + { + "epoch": 1.539880422338125, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.828203201293945, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8809935450553894, + "num_tokens": 461893724.0, + "step": 12105 + }, + { + "epoch": 1.5400076326167156, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.762174606323242, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8555724620819092, + "num_tokens": 461935380.0, + "step": 12106 + }, + { + "epoch": 1.5401348428953059, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.826677322387695, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8737421035766602, + "num_tokens": 461971463.0, + "step": 12107 + }, + { + "epoch": 1.5402620531738964, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.731191635131836, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8745185732841492, + "num_tokens": 462007514.0, + "step": 12108 + }, + { + "epoch": 1.540389263452487, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.864831924438477, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8812274932861328, + "num_tokens": 462046022.0, + "step": 12109 + }, + { + "epoch": 1.5405164737310775, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.846513748168945, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8721293210983276, + "num_tokens": 462089367.0, + "step": 12110 + }, + { + "epoch": 1.540643684009668, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.782596588134766, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8727248907089233, + "num_tokens": 462128558.0, + "step": 12111 + }, + { + "epoch": 1.5407708942882585, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.891216278076172, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8748206496238708, + "num_tokens": 462162639.0, + "step": 12112 + }, + { + "epoch": 1.5408981045668488, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.752777099609375, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8688687086105347, + "num_tokens": 462199303.0, + "step": 12113 + }, + { + "epoch": 1.5410253148454394, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.909292221069336, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8650858402252197, + "num_tokens": 462246095.0, + "step": 12114 + }, + { + "epoch": 1.54115252512403, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.546205520629883, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8637881278991699, + "num_tokens": 462286653.0, + "step": 12115 + }, + { + "epoch": 1.5412797354026204, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.785329818725586, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8652309775352478, + "num_tokens": 462328295.0, + "step": 12116 + }, + { + "epoch": 1.541406945681211, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.920114517211914, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8691939115524292, + "num_tokens": 462365175.0, + "step": 12117 + }, + { + "epoch": 1.5415341559598015, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.892669677734375, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8615167737007141, + "num_tokens": 462407044.0, + "step": 12118 + }, + { + "epoch": 1.541661366238392, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.906761169433594, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8605345487594604, + "num_tokens": 462443819.0, + "step": 12119 + }, + { + "epoch": 1.5417885765169825, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85592269897461, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8774380683898926, + "num_tokens": 462487319.0, + "step": 12120 + }, + { + "epoch": 1.541915786795573, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76688575744629, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8691427707672119, + "num_tokens": 462527026.0, + "step": 12121 + }, + { + "epoch": 1.5420429970741636, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.548952102661133, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8594313859939575, + "num_tokens": 462557552.0, + "step": 12122 + }, + { + "epoch": 1.5421702073527541, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.947269439697266, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8692322373390198, + "num_tokens": 462595243.0, + "step": 12123 + }, + { + "epoch": 1.5422974176313446, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87213706970215, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8705877661705017, + "num_tokens": 462636077.0, + "step": 12124 + }, + { + "epoch": 1.5424246279099352, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.586030960083008, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8781777620315552, + "num_tokens": 462668407.0, + "step": 12125 + }, + { + "epoch": 1.5425518381885257, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81795310974121, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8678643703460693, + "num_tokens": 462707678.0, + "step": 12126 + }, + { + "epoch": 1.5426790484671162, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66474151611328, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8599234223365784, + "num_tokens": 462739722.0, + "step": 12127 + }, + { + "epoch": 1.5428062587457068, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.804000854492188, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8741884231567383, + "num_tokens": 462770369.0, + "step": 12128 + }, + { + "epoch": 1.5429334690242973, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.77821159362793, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8746498823165894, + "num_tokens": 462806126.0, + "step": 12129 + }, + { + "epoch": 1.5430606793028878, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.957319259643555, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8699407577514648, + "num_tokens": 462846585.0, + "step": 12130 + }, + { + "epoch": 1.5431878895814783, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.690324783325195, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8641046285629272, + "num_tokens": 462889892.0, + "step": 12131 + }, + { + "epoch": 1.5433150998600687, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.67998504638672, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8679587841033936, + "num_tokens": 462927612.0, + "step": 12132 + }, + { + "epoch": 1.5434423101386592, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.926374435424805, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8703434467315674, + "num_tokens": 462971661.0, + "step": 12133 + }, + { + "epoch": 1.5435695204172497, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.576391220092773, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8730623126029968, + "num_tokens": 463011260.0, + "step": 12134 + }, + { + "epoch": 1.5436967306958402, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.026742935180664, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8739644289016724, + "num_tokens": 463045395.0, + "step": 12135 + }, + { + "epoch": 1.5438239409744308, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.629398345947266, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8570312261581421, + "num_tokens": 463081015.0, + "step": 12136 + }, + { + "epoch": 1.5439511512530213, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.089523315429688, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8674786686897278, + "num_tokens": 463115658.0, + "step": 12137 + }, + { + "epoch": 1.5440783615316116, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74860382080078, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8799317479133606, + "num_tokens": 463152084.0, + "step": 12138 + }, + { + "epoch": 1.5442055718102021, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.71417808532715, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.858216404914856, + "num_tokens": 463190137.0, + "step": 12139 + }, + { + "epoch": 1.5443327820887927, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.69364356994629, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8672956228256226, + "num_tokens": 463227626.0, + "step": 12140 + }, + { + "epoch": 1.5444599923673832, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.793760299682617, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8692668676376343, + "num_tokens": 463266451.0, + "step": 12141 + }, + { + "epoch": 1.5445872026459737, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.769826889038086, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8564882874488831, + "num_tokens": 463305327.0, + "step": 12142 + }, + { + "epoch": 1.5447144129245642, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76983642578125, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8645309209823608, + "num_tokens": 463345890.0, + "step": 12143 + }, + { + "epoch": 1.5448416232031548, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.913734436035156, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8724023103713989, + "num_tokens": 463381583.0, + "step": 12144 + }, + { + "epoch": 1.5449688334817453, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.025928497314453, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8758455514907837, + "num_tokens": 463414442.0, + "step": 12145 + }, + { + "epoch": 1.5450960437603358, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.603139877319336, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8739891052246094, + "num_tokens": 463449517.0, + "step": 12146 + }, + { + "epoch": 1.5452232540389264, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.827058792114258, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8747202157974243, + "num_tokens": 463487952.0, + "step": 12147 + }, + { + "epoch": 1.5453504643175169, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.680185317993164, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8685435056686401, + "num_tokens": 463520724.0, + "step": 12148 + }, + { + "epoch": 1.5454776745961074, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.668598175048828, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8656104803085327, + "num_tokens": 463564691.0, + "step": 12149 + }, + { + "epoch": 1.545604884874698, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84095573425293, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.864862322807312, + "num_tokens": 463599037.0, + "step": 12150 + }, + { + "epoch": 1.5457320951532885, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.747661590576172, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8564456701278687, + "num_tokens": 463630744.0, + "step": 12151 + }, + { + "epoch": 1.545859305431879, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.762584686279297, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8642359972000122, + "num_tokens": 463663551.0, + "step": 12152 + }, + { + "epoch": 1.5459865157104695, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.496591567993164, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8798184394836426, + "num_tokens": 463702446.0, + "step": 12153 + }, + { + "epoch": 1.54611372598906, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.785470962524414, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8597632646560669, + "num_tokens": 463739324.0, + "step": 12154 + }, + { + "epoch": 1.5462409362676506, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.550304412841797, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8481907844543457, + "num_tokens": 463780765.0, + "step": 12155 + }, + { + "epoch": 1.5463681465462409, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.666513442993164, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8465449810028076, + "num_tokens": 463817727.0, + "step": 12156 + }, + { + "epoch": 1.5464953568248314, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.786787033081055, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8835302591323853, + "num_tokens": 463865793.0, + "step": 12157 + }, + { + "epoch": 1.546622567103422, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.583974838256836, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8691509962081909, + "num_tokens": 463908299.0, + "step": 12158 + }, + { + "epoch": 1.5467497773820125, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.693506240844727, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8648011684417725, + "num_tokens": 463944513.0, + "step": 12159 + }, + { + "epoch": 1.546876987660603, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.63899040222168, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8609441518783569, + "num_tokens": 463982862.0, + "step": 12160 + }, + { + "epoch": 1.5470041979391935, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61431884765625, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8722000122070312, + "num_tokens": 464023357.0, + "step": 12161 + }, + { + "epoch": 1.5471314082177838, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.51506805419922, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.868230938911438, + "num_tokens": 464071889.0, + "step": 12162 + }, + { + "epoch": 1.5472586184963744, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.808883666992188, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8542054891586304, + "num_tokens": 464112334.0, + "step": 12163 + }, + { + "epoch": 1.5473858287749649, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.708650588989258, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8634199500083923, + "num_tokens": 464156833.0, + "step": 12164 + }, + { + "epoch": 1.5475130390535554, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.65985107421875, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8709033727645874, + "num_tokens": 464192989.0, + "step": 12165 + }, + { + "epoch": 1.547640249332146, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83773422241211, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8742058277130127, + "num_tokens": 464232855.0, + "step": 12166 + }, + { + "epoch": 1.5477674596107365, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78606605529785, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8510834574699402, + "num_tokens": 464271860.0, + "step": 12167 + }, + { + "epoch": 1.547894669889327, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.678255081176758, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8805860877037048, + "num_tokens": 464312925.0, + "step": 12168 + }, + { + "epoch": 1.5480218801679175, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.981712341308594, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8630281090736389, + "num_tokens": 464349190.0, + "step": 12169 + }, + { + "epoch": 1.548149090446508, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.598527908325195, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8633102178573608, + "num_tokens": 464392031.0, + "step": 12170 + }, + { + "epoch": 1.5482763007250986, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.673723220825195, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8633922934532166, + "num_tokens": 464424884.0, + "step": 12171 + }, + { + "epoch": 1.5484035110036891, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81439781188965, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8623533844947815, + "num_tokens": 464466047.0, + "step": 12172 + }, + { + "epoch": 1.5485307212822796, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.695053100585938, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8669860363006592, + "num_tokens": 464497293.0, + "step": 12173 + }, + { + "epoch": 1.5486579315608702, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.945587158203125, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8540952801704407, + "num_tokens": 464541873.0, + "step": 12174 + }, + { + "epoch": 1.5487851418394607, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8574161529541, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8679099082946777, + "num_tokens": 464579712.0, + "step": 12175 + }, + { + "epoch": 1.5489123521180512, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.60759735107422, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8664082288742065, + "num_tokens": 464612277.0, + "step": 12176 + }, + { + "epoch": 1.5490395623966418, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.873140335083008, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8642383813858032, + "num_tokens": 464647062.0, + "step": 12177 + }, + { + "epoch": 1.5491667726752323, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81297492980957, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8694164752960205, + "num_tokens": 464677998.0, + "step": 12178 + }, + { + "epoch": 1.5492939829538228, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.617504119873047, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.86060631275177, + "num_tokens": 464719205.0, + "step": 12179 + }, + { + "epoch": 1.5494211932324133, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.876922607421875, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8717638254165649, + "num_tokens": 464760758.0, + "step": 12180 + }, + { + "epoch": 1.5495484035110036, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.77487564086914, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8705660104751587, + "num_tokens": 464797422.0, + "step": 12181 + }, + { + "epoch": 1.5496756137895942, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.766864776611328, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8687224388122559, + "num_tokens": 464832889.0, + "step": 12182 + }, + { + "epoch": 1.5498028240681847, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.580577850341797, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8607068657875061, + "num_tokens": 464867743.0, + "step": 12183 + }, + { + "epoch": 1.5499300343467752, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611474990844727, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8543996810913086, + "num_tokens": 464904666.0, + "step": 12184 + }, + { + "epoch": 1.5500572446253658, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.690807342529297, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8719096779823303, + "num_tokens": 464942886.0, + "step": 12185 + }, + { + "epoch": 1.5501844549039563, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.675981521606445, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8662344813346863, + "num_tokens": 464977463.0, + "step": 12186 + }, + { + "epoch": 1.5503116651825466, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.63286590576172, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8766574263572693, + "num_tokens": 465015732.0, + "step": 12187 + }, + { + "epoch": 1.5504388754611371, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.426307678222656, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8768993020057678, + "num_tokens": 465060789.0, + "step": 12188 + }, + { + "epoch": 1.5505660857397277, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.815502166748047, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8649562001228333, + "num_tokens": 465097907.0, + "step": 12189 + }, + { + "epoch": 1.5506932960183182, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83763885498047, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8753345012664795, + "num_tokens": 465139401.0, + "step": 12190 + }, + { + "epoch": 1.5508205062969087, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.724010467529297, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8459063768386841, + "num_tokens": 465176531.0, + "step": 12191 + }, + { + "epoch": 1.5509477165754992, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.824207305908203, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8763828873634338, + "num_tokens": 465216074.0, + "step": 12192 + }, + { + "epoch": 1.5510749268540898, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.744487762451172, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.880260705947876, + "num_tokens": 465257986.0, + "step": 12193 + }, + { + "epoch": 1.5512021371326803, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.783287048339844, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8718525171279907, + "num_tokens": 465296038.0, + "step": 12194 + }, + { + "epoch": 1.5513293474112708, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8715763092041, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8438988327980042, + "num_tokens": 465334215.0, + "step": 12195 + }, + { + "epoch": 1.5514565576898613, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5512638092041, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8665687441825867, + "num_tokens": 465373362.0, + "step": 12196 + }, + { + "epoch": 1.5515837679684519, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.832849502563477, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.862199068069458, + "num_tokens": 465410181.0, + "step": 12197 + }, + { + "epoch": 1.5517109782470424, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78170394897461, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8753827810287476, + "num_tokens": 465443056.0, + "step": 12198 + }, + { + "epoch": 1.551838188525633, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72047233581543, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8707662224769592, + "num_tokens": 465484509.0, + "step": 12199 + }, + { + "epoch": 1.5519653988042235, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.107616424560547, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8660856485366821, + "num_tokens": 465520017.0, + "step": 12200 + }, + { + "epoch": 1.552092609082814, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.609073638916016, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8842887878417969, + "num_tokens": 465557221.0, + "step": 12201 + }, + { + "epoch": 1.5522198193614045, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00547981262207, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8767147064208984, + "num_tokens": 465595455.0, + "step": 12202 + }, + { + "epoch": 1.552347029639995, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.903072357177734, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8690097332000732, + "num_tokens": 465636436.0, + "step": 12203 + }, + { + "epoch": 1.5524742399185856, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78167152404785, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8647282123565674, + "num_tokens": 465668956.0, + "step": 12204 + }, + { + "epoch": 1.5526014501971759, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.754192352294922, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8487886190414429, + "num_tokens": 465708812.0, + "step": 12205 + }, + { + "epoch": 1.5527286604757664, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.975690841674805, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8547205328941345, + "num_tokens": 465742610.0, + "step": 12206 + }, + { + "epoch": 1.552855870754357, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.616777420043945, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8674436211585999, + "num_tokens": 465776468.0, + "step": 12207 + }, + { + "epoch": 1.5529830810329475, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.829898834228516, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8849297761917114, + "num_tokens": 465809300.0, + "step": 12208 + }, + { + "epoch": 1.553110291311538, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.62267303466797, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8697668313980103, + "num_tokens": 465849847.0, + "step": 12209 + }, + { + "epoch": 1.5532375015901285, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.900964736938477, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8790393471717834, + "num_tokens": 465884986.0, + "step": 12210 + }, + { + "epoch": 1.5533647118687188, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.967395782470703, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8748117685317993, + "num_tokens": 465919710.0, + "step": 12211 + }, + { + "epoch": 1.5534919221473094, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.648303985595703, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.86018967628479, + "num_tokens": 465956041.0, + "step": 12212 + }, + { + "epoch": 1.5536191324258999, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.893434524536133, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.879533052444458, + "num_tokens": 465994352.0, + "step": 12213 + }, + { + "epoch": 1.5537463427044904, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.802730560302734, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8680812120437622, + "num_tokens": 466032493.0, + "step": 12214 + }, + { + "epoch": 1.553873552983081, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74766731262207, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8712347745895386, + "num_tokens": 466067048.0, + "step": 12215 + }, + { + "epoch": 1.5540007632616715, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.840408325195312, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8458987474441528, + "num_tokens": 466106895.0, + "step": 12216 + }, + { + "epoch": 1.554127973540262, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.628997802734375, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8537795543670654, + "num_tokens": 466146740.0, + "step": 12217 + }, + { + "epoch": 1.5542551838188525, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.770959854125977, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8737911581993103, + "num_tokens": 466187097.0, + "step": 12218 + }, + { + "epoch": 1.554382394097443, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00635528564453, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8584027290344238, + "num_tokens": 466223969.0, + "step": 12219 + }, + { + "epoch": 1.5545096043760336, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.684850692749023, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8788599967956543, + "num_tokens": 466260127.0, + "step": 12220 + }, + { + "epoch": 1.554636814654624, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.82501220703125, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8740653991699219, + "num_tokens": 466297219.0, + "step": 12221 + }, + { + "epoch": 1.5547640249332146, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.732091903686523, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8481409549713135, + "num_tokens": 466329474.0, + "step": 12222 + }, + { + "epoch": 1.5548912352118052, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7582950592041, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8667070865631104, + "num_tokens": 466372801.0, + "step": 12223 + }, + { + "epoch": 1.5550184454903957, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.666553497314453, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8746967911720276, + "num_tokens": 466412950.0, + "step": 12224 + }, + { + "epoch": 1.5551456557689862, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74932861328125, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8749085068702698, + "num_tokens": 466448830.0, + "step": 12225 + }, + { + "epoch": 1.5552728660475768, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.71499252319336, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8571380376815796, + "num_tokens": 466489749.0, + "step": 12226 + }, + { + "epoch": 1.5554000763261673, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.694026947021484, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8497158288955688, + "num_tokens": 466531320.0, + "step": 12227 + }, + { + "epoch": 1.5555272866047578, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76136589050293, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8597726225852966, + "num_tokens": 466570001.0, + "step": 12228 + }, + { + "epoch": 1.5556544968833483, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.73379135131836, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8710281848907471, + "num_tokens": 466607186.0, + "step": 12229 + }, + { + "epoch": 1.5557817071619386, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.939159393310547, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8589344024658203, + "num_tokens": 466649480.0, + "step": 12230 + }, + { + "epoch": 1.5559089174405292, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.740751266479492, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8742644786834717, + "num_tokens": 466688861.0, + "step": 12231 + }, + { + "epoch": 1.5560361277191197, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.746299743652344, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.848945677280426, + "num_tokens": 466723200.0, + "step": 12232 + }, + { + "epoch": 1.5561633379977102, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.783315658569336, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8538885116577148, + "num_tokens": 466759639.0, + "step": 12233 + }, + { + "epoch": 1.5562905482763008, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.671260833740234, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8617070317268372, + "num_tokens": 466795748.0, + "step": 12234 + }, + { + "epoch": 1.5564177585548913, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.882835388183594, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8774023056030273, + "num_tokens": 466832938.0, + "step": 12235 + }, + { + "epoch": 1.5565449688334816, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9969425201416, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8606430292129517, + "num_tokens": 466868710.0, + "step": 12236 + }, + { + "epoch": 1.5566721791120721, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.617950439453125, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.865127682685852, + "num_tokens": 466903558.0, + "step": 12237 + }, + { + "epoch": 1.5567993893906626, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.944881439208984, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8578941822052002, + "num_tokens": 466946785.0, + "step": 12238 + }, + { + "epoch": 1.5569265996692532, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.667282104492188, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8595059514045715, + "num_tokens": 466988563.0, + "step": 12239 + }, + { + "epoch": 1.5570538099478437, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.633113861083984, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8612785339355469, + "num_tokens": 467025447.0, + "step": 12240 + }, + { + "epoch": 1.5571810202264342, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.94767951965332, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8741700649261475, + "num_tokens": 467060751.0, + "step": 12241 + }, + { + "epoch": 1.5573082305050248, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.876291275024414, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8667455911636353, + "num_tokens": 467101889.0, + "step": 12242 + }, + { + "epoch": 1.5574354407836153, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.785442352294922, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8698476552963257, + "num_tokens": 467136782.0, + "step": 12243 + }, + { + "epoch": 1.5575626510622058, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.798959732055664, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8759772181510925, + "num_tokens": 467174137.0, + "step": 12244 + }, + { + "epoch": 1.5576898613407963, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79645347595215, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.863237202167511, + "num_tokens": 467213919.0, + "step": 12245 + }, + { + "epoch": 1.5578170716193869, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.618682861328125, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.875206708908081, + "num_tokens": 467254860.0, + "step": 12246 + }, + { + "epoch": 1.5579442818979774, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.921401977539062, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8773773312568665, + "num_tokens": 467296806.0, + "step": 12247 + }, + { + "epoch": 1.558071492176568, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.51423454284668, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.873786985874176, + "num_tokens": 467332882.0, + "step": 12248 + }, + { + "epoch": 1.5581987024551585, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93327522277832, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8816820383071899, + "num_tokens": 467375088.0, + "step": 12249 + }, + { + "epoch": 1.558325912733749, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.830419540405273, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8673444986343384, + "num_tokens": 467413000.0, + "step": 12250 + }, + { + "epoch": 1.5584531230123395, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.786846160888672, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8814736604690552, + "num_tokens": 467450947.0, + "step": 12251 + }, + { + "epoch": 1.55858033329093, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79071807861328, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8574955463409424, + "num_tokens": 467493770.0, + "step": 12252 + }, + { + "epoch": 1.5587075435695206, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86998748779297, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8648183941841125, + "num_tokens": 467531882.0, + "step": 12253 + }, + { + "epoch": 1.5588347538481109, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0176944732666, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8925999999046326, + "num_tokens": 467566059.0, + "step": 12254 + }, + { + "epoch": 1.5589619641267014, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.665260314941406, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8824152946472168, + "num_tokens": 467603734.0, + "step": 12255 + }, + { + "epoch": 1.559089174405292, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.844900131225586, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8750902414321899, + "num_tokens": 467641224.0, + "step": 12256 + }, + { + "epoch": 1.5592163846838825, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.583349227905273, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.866463840007782, + "num_tokens": 467677576.0, + "step": 12257 + }, + { + "epoch": 1.559343594962473, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.853036880493164, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8747488260269165, + "num_tokens": 467717526.0, + "step": 12258 + }, + { + "epoch": 1.5594708052410635, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99457550048828, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8638319969177246, + "num_tokens": 467757591.0, + "step": 12259 + }, + { + "epoch": 1.5595980155196538, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.588241577148438, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8534791469573975, + "num_tokens": 467792794.0, + "step": 12260 + }, + { + "epoch": 1.5597252257982444, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.764623641967773, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8627511262893677, + "num_tokens": 467829554.0, + "step": 12261 + }, + { + "epoch": 1.5598524360768349, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.827465057373047, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8511412143707275, + "num_tokens": 467871403.0, + "step": 12262 + }, + { + "epoch": 1.5599796463554254, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.654342651367188, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8478779196739197, + "num_tokens": 467906498.0, + "step": 12263 + }, + { + "epoch": 1.560106856634016, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.62116050720215, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8654531240463257, + "num_tokens": 467946426.0, + "step": 12264 + }, + { + "epoch": 1.5602340669126065, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.836687088012695, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.870741605758667, + "num_tokens": 467979930.0, + "step": 12265 + }, + { + "epoch": 1.560361277191197, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70928382873535, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8616900444030762, + "num_tokens": 468016347.0, + "step": 12266 + }, + { + "epoch": 1.5604884874697875, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.759002685546875, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8737241625785828, + "num_tokens": 468051446.0, + "step": 12267 + }, + { + "epoch": 1.560615697748378, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.886106491088867, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8836245536804199, + "num_tokens": 468085713.0, + "step": 12268 + }, + { + "epoch": 1.5607429080269686, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.746597290039062, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8796100616455078, + "num_tokens": 468117863.0, + "step": 12269 + }, + { + "epoch": 1.560870118305559, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.682859420776367, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8638447523117065, + "num_tokens": 468153041.0, + "step": 12270 + }, + { + "epoch": 1.5609973285841496, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.706663131713867, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8654439449310303, + "num_tokens": 468193624.0, + "step": 12271 + }, + { + "epoch": 1.5611245388627402, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.68626594543457, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8823617100715637, + "num_tokens": 468227091.0, + "step": 12272 + }, + { + "epoch": 1.5612517491413307, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.619800567626953, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8629676103591919, + "num_tokens": 468265079.0, + "step": 12273 + }, + { + "epoch": 1.5613789594199212, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5388240814209, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8644536733627319, + "num_tokens": 468307055.0, + "step": 12274 + }, + { + "epoch": 1.5615061696985117, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72716522216797, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8616812825202942, + "num_tokens": 468344429.0, + "step": 12275 + }, + { + "epoch": 1.5616333799771023, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.519346237182617, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8665485382080078, + "num_tokens": 468379448.0, + "step": 12276 + }, + { + "epoch": 1.5617605902556928, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97945213317871, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8708223104476929, + "num_tokens": 468423344.0, + "step": 12277 + }, + { + "epoch": 1.5618878005342833, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.585268020629883, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8732084631919861, + "num_tokens": 468461657.0, + "step": 12278 + }, + { + "epoch": 1.5620150108128736, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81869888305664, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8822740912437439, + "num_tokens": 468500428.0, + "step": 12279 + }, + { + "epoch": 1.5621422210914642, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.699735641479492, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8566842079162598, + "num_tokens": 468536310.0, + "step": 12280 + }, + { + "epoch": 1.5622694313700547, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.729877471923828, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.865496039390564, + "num_tokens": 468577954.0, + "step": 12281 + }, + { + "epoch": 1.5623966416486452, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.785823822021484, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8583850860595703, + "num_tokens": 468612131.0, + "step": 12282 + }, + { + "epoch": 1.5625238519272358, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72505760192871, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8629899024963379, + "num_tokens": 468654022.0, + "step": 12283 + }, + { + "epoch": 1.5626510622058263, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.882709503173828, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8587059378623962, + "num_tokens": 468693120.0, + "step": 12284 + }, + { + "epoch": 1.5627782724844166, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79697608947754, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8667391538619995, + "num_tokens": 468727695.0, + "step": 12285 + }, + { + "epoch": 1.5629054827630071, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.721115112304688, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8633724451065063, + "num_tokens": 468766705.0, + "step": 12286 + }, + { + "epoch": 1.5630326930415976, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9184513092041, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8576281666755676, + "num_tokens": 468804555.0, + "step": 12287 + }, + { + "epoch": 1.5631599033201882, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.712472915649414, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8640127778053284, + "num_tokens": 468842495.0, + "step": 12288 + }, + { + "epoch": 1.5632871135987787, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.831424713134766, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8664352893829346, + "num_tokens": 468877574.0, + "step": 12289 + }, + { + "epoch": 1.5634143238773692, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.803218841552734, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8792015314102173, + "num_tokens": 468914573.0, + "step": 12290 + }, + { + "epoch": 1.5635415341559598, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7504940032959, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8777122497558594, + "num_tokens": 468950943.0, + "step": 12291 + }, + { + "epoch": 1.5636687444345503, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.050172805786133, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.881516695022583, + "num_tokens": 468990007.0, + "step": 12292 + }, + { + "epoch": 1.5637959547131408, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.61185073852539, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8710007667541504, + "num_tokens": 469022461.0, + "step": 12293 + }, + { + "epoch": 1.5639231649917313, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76970672607422, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8728317022323608, + "num_tokens": 469056396.0, + "step": 12294 + }, + { + "epoch": 1.5640503752703219, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7406005859375, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.876328706741333, + "num_tokens": 469099464.0, + "step": 12295 + }, + { + "epoch": 1.5641775855489124, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.822036743164062, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8746837377548218, + "num_tokens": 469139830.0, + "step": 12296 + }, + { + "epoch": 1.564304795827503, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.735008239746094, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8661022782325745, + "num_tokens": 469177838.0, + "step": 12297 + }, + { + "epoch": 1.5644320061060935, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.72324562072754, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8924461603164673, + "num_tokens": 469218727.0, + "step": 12298 + }, + { + "epoch": 1.564559216384684, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.668001174926758, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8455510139465332, + "num_tokens": 469261186.0, + "step": 12299 + }, + { + "epoch": 1.5646864266632745, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.853830337524414, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.875067412853241, + "num_tokens": 469301191.0, + "step": 12300 + }, + { + "epoch": 1.564813636941865, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.838708877563477, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8724499940872192, + "num_tokens": 469340132.0, + "step": 12301 + }, + { + "epoch": 1.5649408472204556, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81491470336914, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8867650032043457, + "num_tokens": 469376211.0, + "step": 12302 + }, + { + "epoch": 1.5650680574990459, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.074174880981445, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8577697277069092, + "num_tokens": 469413961.0, + "step": 12303 + }, + { + "epoch": 1.5651952677776364, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.767181396484375, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8673039674758911, + "num_tokens": 469449507.0, + "step": 12304 + }, + { + "epoch": 1.565322478056227, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.016206741333008, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8552370071411133, + "num_tokens": 469486324.0, + "step": 12305 + }, + { + "epoch": 1.5654496883348175, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.006942749023438, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8634055256843567, + "num_tokens": 469523799.0, + "step": 12306 + }, + { + "epoch": 1.565576898613408, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.885129928588867, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8534376621246338, + "num_tokens": 469558661.0, + "step": 12307 + }, + { + "epoch": 1.5657041088919985, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99384307861328, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8667470216751099, + "num_tokens": 469603087.0, + "step": 12308 + }, + { + "epoch": 1.5658313191705888, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.839462280273438, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8673985600471497, + "num_tokens": 469637344.0, + "step": 12309 + }, + { + "epoch": 1.5659585294491793, + "ewc_loss": 0.03271484375, + "ewc_loss_parallel": 3.266334533691406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.935171127319336, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.876417338848114, + "num_tokens": 469681127.0, + "step": 12310 + }, + { + "epoch": 1.5660857397277699, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.782724380493164, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.86656653881073, + "num_tokens": 469724238.0, + "step": 12311 + }, + { + "epoch": 1.5662129500063604, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.893035888671875, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8607518672943115, + "num_tokens": 469765967.0, + "step": 12312 + }, + { + "epoch": 1.566340160284951, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.795019149780273, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8686427474021912, + "num_tokens": 469801387.0, + "step": 12313 + }, + { + "epoch": 1.5664673705635415, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70592498779297, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8689706325531006, + "num_tokens": 469835448.0, + "step": 12314 + }, + { + "epoch": 1.566594580842132, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.819162368774414, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.858929455280304, + "num_tokens": 469876647.0, + "step": 12315 + }, + { + "epoch": 1.5667217911207225, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.727319717407227, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8815621733665466, + "num_tokens": 469918158.0, + "step": 12316 + }, + { + "epoch": 1.566849001399313, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.873750686645508, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8535098433494568, + "num_tokens": 469953691.0, + "step": 12317 + }, + { + "epoch": 1.5669762116779036, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.745967864990234, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8705669045448303, + "num_tokens": 469996174.0, + "step": 12318 + }, + { + "epoch": 1.567103421956494, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.003360748291016, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8554369807243347, + "num_tokens": 470031371.0, + "step": 12319 + }, + { + "epoch": 1.5672306322350846, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.622787475585938, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8716093301773071, + "num_tokens": 470068783.0, + "step": 12320 + }, + { + "epoch": 1.5673578425136752, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83458709716797, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8506805896759033, + "num_tokens": 470103490.0, + "step": 12321 + }, + { + "epoch": 1.5674850527922657, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81495475769043, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8634092807769775, + "num_tokens": 470137213.0, + "step": 12322 + }, + { + "epoch": 1.5676122630708562, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75225830078125, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8726195096969604, + "num_tokens": 470171816.0, + "step": 12323 + }, + { + "epoch": 1.5677394733494467, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.753982543945312, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8446526527404785, + "num_tokens": 470210517.0, + "step": 12324 + }, + { + "epoch": 1.5678666836280373, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.858240127563477, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8558176755905151, + "num_tokens": 470247085.0, + "step": 12325 + }, + { + "epoch": 1.5679938939066278, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.82103157043457, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8677833080291748, + "num_tokens": 470288483.0, + "step": 12326 + }, + { + "epoch": 1.5681211041852183, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.762537002563477, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8677240610122681, + "num_tokens": 470322173.0, + "step": 12327 + }, + { + "epoch": 1.5682483144638086, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8129825592041, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.869272768497467, + "num_tokens": 470356617.0, + "step": 12328 + }, + { + "epoch": 1.5683755247423992, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.761629104614258, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8775405883789062, + "num_tokens": 470393095.0, + "step": 12329 + }, + { + "epoch": 1.5685027350209897, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.671836853027344, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8786344528198242, + "num_tokens": 470427785.0, + "step": 12330 + }, + { + "epoch": 1.5686299452995802, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.854381561279297, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8619996309280396, + "num_tokens": 470469971.0, + "step": 12331 + }, + { + "epoch": 1.5687571555781707, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.866544723510742, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8594449162483215, + "num_tokens": 470503935.0, + "step": 12332 + }, + { + "epoch": 1.5688843658567613, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.623830795288086, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8707836866378784, + "num_tokens": 470536014.0, + "step": 12333 + }, + { + "epoch": 1.5690115761353516, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.708267211914062, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8798565864562988, + "num_tokens": 470575632.0, + "step": 12334 + }, + { + "epoch": 1.569138786413942, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.905202865600586, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8778417706489563, + "num_tokens": 470611295.0, + "step": 12335 + }, + { + "epoch": 1.5692659966925326, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.839237213134766, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.865887463092804, + "num_tokens": 470646993.0, + "step": 12336 + }, + { + "epoch": 1.5693932069711232, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.621118545532227, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8706347942352295, + "num_tokens": 470680253.0, + "step": 12337 + }, + { + "epoch": 1.5695204172497137, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.703245162963867, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.865134596824646, + "num_tokens": 470716169.0, + "step": 12338 + }, + { + "epoch": 1.5696476275283042, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.779733657836914, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8803137540817261, + "num_tokens": 470754208.0, + "step": 12339 + }, + { + "epoch": 1.5697748378068948, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.907459259033203, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8692271113395691, + "num_tokens": 470801329.0, + "step": 12340 + }, + { + "epoch": 1.5699020480854853, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.868343353271484, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8633373975753784, + "num_tokens": 470838408.0, + "step": 12341 + }, + { + "epoch": 1.5700292583640758, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83084487915039, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8672473430633545, + "num_tokens": 470874001.0, + "step": 12342 + }, + { + "epoch": 1.5701564686426663, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.631860733032227, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8621549606323242, + "num_tokens": 470916870.0, + "step": 12343 + }, + { + "epoch": 1.5702836789212569, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75062370300293, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8763875365257263, + "num_tokens": 470953862.0, + "step": 12344 + }, + { + "epoch": 1.5704108891998474, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.844242095947266, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8772960305213928, + "num_tokens": 470991507.0, + "step": 12345 + }, + { + "epoch": 1.570538099478438, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.643613815307617, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8710197806358337, + "num_tokens": 471030109.0, + "step": 12346 + }, + { + "epoch": 1.5706653097570284, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7917423248291, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8814326524734497, + "num_tokens": 471063829.0, + "step": 12347 + }, + { + "epoch": 1.570792520035619, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.867605209350586, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8577147722244263, + "num_tokens": 471107259.0, + "step": 12348 + }, + { + "epoch": 1.5709197303142095, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76028823852539, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8709737658500671, + "num_tokens": 471141516.0, + "step": 12349 + }, + { + "epoch": 1.5710469405928, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.851821899414062, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8680305480957031, + "num_tokens": 471178655.0, + "step": 12350 + }, + { + "epoch": 1.5711741508713906, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.777116775512695, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8793821334838867, + "num_tokens": 471209437.0, + "step": 12351 + }, + { + "epoch": 1.5713013611499809, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83005714416504, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8827029466629028, + "num_tokens": 471249559.0, + "step": 12352 + }, + { + "epoch": 1.5714285714285714, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.611419677734375, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8729503154754639, + "num_tokens": 471287179.0, + "step": 12353 + }, + { + "epoch": 1.571555781707162, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.639053344726562, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8708952069282532, + "num_tokens": 471328052.0, + "step": 12354 + }, + { + "epoch": 1.5716829919857525, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.088668823242188, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8666858673095703, + "num_tokens": 471362414.0, + "step": 12355 + }, + { + "epoch": 1.571810202264343, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66436195373535, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8642376065254211, + "num_tokens": 471397910.0, + "step": 12356 + }, + { + "epoch": 1.5719374125429335, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70771598815918, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8825184106826782, + "num_tokens": 471436951.0, + "step": 12357 + }, + { + "epoch": 1.5720646228215238, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84699249267578, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8702172040939331, + "num_tokens": 471470139.0, + "step": 12358 + }, + { + "epoch": 1.5721918331001143, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.909461975097656, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8762305378913879, + "num_tokens": 471508591.0, + "step": 12359 + }, + { + "epoch": 1.5723190433787049, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89653205871582, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8690860867500305, + "num_tokens": 471549165.0, + "step": 12360 + }, + { + "epoch": 1.5724462536572954, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.673349380493164, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8622267246246338, + "num_tokens": 471589021.0, + "step": 12361 + }, + { + "epoch": 1.572573463935886, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.80219841003418, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8617658615112305, + "num_tokens": 471624263.0, + "step": 12362 + }, + { + "epoch": 1.5727006742144765, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.67872428894043, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8621832728385925, + "num_tokens": 471663678.0, + "step": 12363 + }, + { + "epoch": 1.572827884493067, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.967275619506836, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8660606145858765, + "num_tokens": 471705887.0, + "step": 12364 + }, + { + "epoch": 1.5729550947716575, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.71721839904785, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.876158595085144, + "num_tokens": 471739796.0, + "step": 12365 + }, + { + "epoch": 1.573082305050248, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84650993347168, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8647932410240173, + "num_tokens": 471774723.0, + "step": 12366 + }, + { + "epoch": 1.5732095153288386, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.974306106567383, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.861415684223175, + "num_tokens": 471814169.0, + "step": 12367 + }, + { + "epoch": 1.573336725607429, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.722522735595703, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8697603940963745, + "num_tokens": 471852367.0, + "step": 12368 + }, + { + "epoch": 1.5734639358860196, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.82352638244629, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8748052716255188, + "num_tokens": 471883438.0, + "step": 12369 + }, + { + "epoch": 1.5735911461646102, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.953096389770508, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8631575107574463, + "num_tokens": 471921051.0, + "step": 12370 + }, + { + "epoch": 1.5737183564432007, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84276008605957, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8718747496604919, + "num_tokens": 471959685.0, + "step": 12371 + }, + { + "epoch": 1.5738455667217912, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.811847686767578, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8689411878585815, + "num_tokens": 471997345.0, + "step": 12372 + }, + { + "epoch": 1.5739727770003817, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.88050651550293, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8742982745170593, + "num_tokens": 472034455.0, + "step": 12373 + }, + { + "epoch": 1.5740999872789723, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81540298461914, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8676996231079102, + "num_tokens": 472073630.0, + "step": 12374 + }, + { + "epoch": 1.5742271975575628, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.759313583374023, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8666906356811523, + "num_tokens": 472111556.0, + "step": 12375 + }, + { + "epoch": 1.5743544078361533, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.938495635986328, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8756641745567322, + "num_tokens": 472148276.0, + "step": 12376 + }, + { + "epoch": 1.5744816181147436, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.78384780883789, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8676570057868958, + "num_tokens": 472181665.0, + "step": 12377 + }, + { + "epoch": 1.5746088283933342, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.768810272216797, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8623840808868408, + "num_tokens": 472222759.0, + "step": 12378 + }, + { + "epoch": 1.5747360386719247, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99995231628418, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8628296852111816, + "num_tokens": 472260867.0, + "step": 12379 + }, + { + "epoch": 1.5748632489505152, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.704734802246094, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8565840721130371, + "num_tokens": 472300068.0, + "step": 12380 + }, + { + "epoch": 1.5749904592291057, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.904260635375977, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8785761594772339, + "num_tokens": 472336459.0, + "step": 12381 + }, + { + "epoch": 1.5751176695076963, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.646503448486328, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8728069067001343, + "num_tokens": 472376696.0, + "step": 12382 + }, + { + "epoch": 1.5752448797862866, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.958105087280273, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.880543053150177, + "num_tokens": 472414491.0, + "step": 12383 + }, + { + "epoch": 1.575372090064877, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.851133346557617, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.86829674243927, + "num_tokens": 472452188.0, + "step": 12384 + }, + { + "epoch": 1.5754993003434676, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.776283264160156, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8676337003707886, + "num_tokens": 472489690.0, + "step": 12385 + }, + { + "epoch": 1.5756265106220582, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.810138702392578, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8667699098587036, + "num_tokens": 472529912.0, + "step": 12386 + }, + { + "epoch": 1.5757537209006487, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.840213775634766, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.867019534111023, + "num_tokens": 472562534.0, + "step": 12387 + }, + { + "epoch": 1.5758809311792392, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.759376525878906, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8667583465576172, + "num_tokens": 472598955.0, + "step": 12388 + }, + { + "epoch": 1.5760081414578297, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.927736282348633, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8711564540863037, + "num_tokens": 472638064.0, + "step": 12389 + }, + { + "epoch": 1.5761353517364203, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.934486389160156, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8718478679656982, + "num_tokens": 472676823.0, + "step": 12390 + }, + { + "epoch": 1.5762625620150108, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.757558822631836, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8682045936584473, + "num_tokens": 472718528.0, + "step": 12391 + }, + { + "epoch": 1.5763897722936013, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.844467163085938, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8749627470970154, + "num_tokens": 472755634.0, + "step": 12392 + }, + { + "epoch": 1.5765169825721919, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.844274520874023, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8796801567077637, + "num_tokens": 472798060.0, + "step": 12393 + }, + { + "epoch": 1.5766441928507824, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.695655822753906, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8736855387687683, + "num_tokens": 472837628.0, + "step": 12394 + }, + { + "epoch": 1.576771403129373, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.006372451782227, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8698201179504395, + "num_tokens": 472877446.0, + "step": 12395 + }, + { + "epoch": 1.5768986134079634, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.863203048706055, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8623356223106384, + "num_tokens": 472918437.0, + "step": 12396 + }, + { + "epoch": 1.577025823686554, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.755718231201172, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8779424428939819, + "num_tokens": 472956242.0, + "step": 12397 + }, + { + "epoch": 1.5771530339651445, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.10572624206543, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8783389925956726, + "num_tokens": 472985620.0, + "step": 12398 + }, + { + "epoch": 1.577280244243735, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.681427001953125, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8737435936927795, + "num_tokens": 473019517.0, + "step": 12399 + }, + { + "epoch": 1.5774074545223256, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.922693252563477, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8883353471755981, + "num_tokens": 473054624.0, + "step": 12400 + }, + { + "epoch": 1.5775346648009159, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.810131072998047, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8594239950180054, + "num_tokens": 473094140.0, + "step": 12401 + }, + { + "epoch": 1.5776618750795064, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79205322265625, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8501235842704773, + "num_tokens": 473136886.0, + "step": 12402 + }, + { + "epoch": 1.577789085358097, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.016042709350586, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8572226762771606, + "num_tokens": 473173267.0, + "step": 12403 + }, + { + "epoch": 1.5779162956366874, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7976016998291, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8484847545623779, + "num_tokens": 473216477.0, + "step": 12404 + }, + { + "epoch": 1.578043505915278, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.834054946899414, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8609635829925537, + "num_tokens": 473256986.0, + "step": 12405 + }, + { + "epoch": 1.5781707161938685, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.921772003173828, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8771232962608337, + "num_tokens": 473297967.0, + "step": 12406 + }, + { + "epoch": 1.5782979264724588, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.962648391723633, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8613308668136597, + "num_tokens": 473333580.0, + "step": 12407 + }, + { + "epoch": 1.5784251367510493, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.981426239013672, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8727877140045166, + "num_tokens": 473370553.0, + "step": 12408 + }, + { + "epoch": 1.5785523470296399, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.863853454589844, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8505075573921204, + "num_tokens": 473410705.0, + "step": 12409 + }, + { + "epoch": 1.5786795573082304, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.843416213989258, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.873234748840332, + "num_tokens": 473443315.0, + "step": 12410 + }, + { + "epoch": 1.578806767586821, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89903450012207, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8606731295585632, + "num_tokens": 473488767.0, + "step": 12411 + }, + { + "epoch": 1.5789339778654115, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.630720138549805, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8831562995910645, + "num_tokens": 473525360.0, + "step": 12412 + }, + { + "epoch": 1.579061188144002, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.890342712402344, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8606114983558655, + "num_tokens": 473560625.0, + "step": 12413 + }, + { + "epoch": 1.5791883984225925, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.812572479248047, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8863910436630249, + "num_tokens": 473595484.0, + "step": 12414 + }, + { + "epoch": 1.579315608701183, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.733505249023438, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8653249740600586, + "num_tokens": 473627269.0, + "step": 12415 + }, + { + "epoch": 1.5794428189797736, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91253662109375, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8532055616378784, + "num_tokens": 473669159.0, + "step": 12416 + }, + { + "epoch": 1.579570029258364, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.829435348510742, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8781605958938599, + "num_tokens": 473707739.0, + "step": 12417 + }, + { + "epoch": 1.5796972395369546, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.7757511138916, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8698734045028687, + "num_tokens": 473746908.0, + "step": 12418 + }, + { + "epoch": 1.5798244498155452, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.685253143310547, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8669760227203369, + "num_tokens": 473781480.0, + "step": 12419 + }, + { + "epoch": 1.5799516600941357, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.94719886779785, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8677834272384644, + "num_tokens": 473814676.0, + "step": 12420 + }, + { + "epoch": 1.5800788703727262, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.750638961791992, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8648473024368286, + "num_tokens": 473855487.0, + "step": 12421 + }, + { + "epoch": 1.5802060806513167, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.73274803161621, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8809645175933838, + "num_tokens": 473885648.0, + "step": 12422 + }, + { + "epoch": 1.5803332909299073, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.895671844482422, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8639645576477051, + "num_tokens": 473923862.0, + "step": 12423 + }, + { + "epoch": 1.5804605012084978, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.769901275634766, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8729110956192017, + "num_tokens": 473957725.0, + "step": 12424 + }, + { + "epoch": 1.5805877114870883, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.869016647338867, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.87054443359375, + "num_tokens": 473994939.0, + "step": 12425 + }, + { + "epoch": 1.5807149217656786, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.819232940673828, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8643800020217896, + "num_tokens": 474035996.0, + "step": 12426 + }, + { + "epoch": 1.5808421320442692, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.815793991088867, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8768410682678223, + "num_tokens": 474071693.0, + "step": 12427 + }, + { + "epoch": 1.5809693423228597, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.643041610717773, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8757455945014954, + "num_tokens": 474114415.0, + "step": 12428 + }, + { + "epoch": 1.5810965526014502, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.80050277709961, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8626803159713745, + "num_tokens": 474152060.0, + "step": 12429 + }, + { + "epoch": 1.5812237628800407, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.802915573120117, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8725967407226562, + "num_tokens": 474190605.0, + "step": 12430 + }, + { + "epoch": 1.5813509731586313, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.821683883666992, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8637089133262634, + "num_tokens": 474229447.0, + "step": 12431 + }, + { + "epoch": 1.5814781834372216, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.845312118530273, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8758713603019714, + "num_tokens": 474268088.0, + "step": 12432 + }, + { + "epoch": 1.581605393715812, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.931455612182617, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8645755648612976, + "num_tokens": 474304856.0, + "step": 12433 + }, + { + "epoch": 1.5817326039944026, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.790496826171875, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8713506460189819, + "num_tokens": 474349248.0, + "step": 12434 + }, + { + "epoch": 1.5818598142729932, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85607147216797, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8782241940498352, + "num_tokens": 474384432.0, + "step": 12435 + }, + { + "epoch": 1.5819870245515837, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.828977584838867, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8659365177154541, + "num_tokens": 474415881.0, + "step": 12436 + }, + { + "epoch": 1.5821142348301742, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.77609634399414, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8661386966705322, + "num_tokens": 474452755.0, + "step": 12437 + }, + { + "epoch": 1.5822414451087647, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8857479095459, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.863881528377533, + "num_tokens": 474493937.0, + "step": 12438 + }, + { + "epoch": 1.5823686553873553, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.035404205322266, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8745132684707642, + "num_tokens": 474534072.0, + "step": 12439 + }, + { + "epoch": 1.5824958656659458, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.730060577392578, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8610368967056274, + "num_tokens": 474569343.0, + "step": 12440 + }, + { + "epoch": 1.5826230759445363, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.76238441467285, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8584359884262085, + "num_tokens": 474606610.0, + "step": 12441 + }, + { + "epoch": 1.5827502862231269, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.014423370361328, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8600060939788818, + "num_tokens": 474645071.0, + "step": 12442 + }, + { + "epoch": 1.5828774965017174, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85344886779785, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8556641340255737, + "num_tokens": 474683985.0, + "step": 12443 + }, + { + "epoch": 1.583004706780308, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.776132583618164, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.87450110912323, + "num_tokens": 474726102.0, + "step": 12444 + }, + { + "epoch": 1.5831319170588984, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91757583618164, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8707633018493652, + "num_tokens": 474766430.0, + "step": 12445 + }, + { + "epoch": 1.583259127337489, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84510040283203, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8665966987609863, + "num_tokens": 474800648.0, + "step": 12446 + }, + { + "epoch": 1.5833863376160795, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.88482093811035, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8716545104980469, + "num_tokens": 474845540.0, + "step": 12447 + }, + { + "epoch": 1.58351354789467, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.874820709228516, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8661305904388428, + "num_tokens": 474884296.0, + "step": 12448 + }, + { + "epoch": 1.5836407581732606, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.599275588989258, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8724898099899292, + "num_tokens": 474929344.0, + "step": 12449 + }, + { + "epoch": 1.5837679684518509, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.979740142822266, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8732479810714722, + "num_tokens": 474968035.0, + "step": 12450 + }, + { + "epoch": 1.5838951787304414, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.905580520629883, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8630527257919312, + "num_tokens": 475006189.0, + "step": 12451 + }, + { + "epoch": 1.584022389009032, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.676267623901367, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8634573221206665, + "num_tokens": 475039768.0, + "step": 12452 + }, + { + "epoch": 1.5841495992876224, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8026065826416, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8741191625595093, + "num_tokens": 475079938.0, + "step": 12453 + }, + { + "epoch": 1.584276809566213, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.954269409179688, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8561443090438843, + "num_tokens": 475122405.0, + "step": 12454 + }, + { + "epoch": 1.5844040198448035, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.817428588867188, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8610867261886597, + "num_tokens": 475158237.0, + "step": 12455 + }, + { + "epoch": 1.5845312301233938, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.984376907348633, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.869446873664856, + "num_tokens": 475194966.0, + "step": 12456 + }, + { + "epoch": 1.5846584404019843, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97810935974121, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8666172027587891, + "num_tokens": 475234483.0, + "step": 12457 + }, + { + "epoch": 1.5847856506805749, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.884014129638672, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8628755807876587, + "num_tokens": 475275172.0, + "step": 12458 + }, + { + "epoch": 1.5849128609591654, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.987018585205078, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8727918267250061, + "num_tokens": 475313550.0, + "step": 12459 + }, + { + "epoch": 1.585040071237756, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.790632247924805, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8752983808517456, + "num_tokens": 475357019.0, + "step": 12460 + }, + { + "epoch": 1.5851672815163464, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79985809326172, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8736763000488281, + "num_tokens": 475394788.0, + "step": 12461 + }, + { + "epoch": 1.585294491794937, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.861698150634766, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8663693070411682, + "num_tokens": 475432354.0, + "step": 12462 + }, + { + "epoch": 1.5854217020735275, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83244514465332, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8722147941589355, + "num_tokens": 475467284.0, + "step": 12463 + }, + { + "epoch": 1.585548912352118, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74188232421875, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8602200150489807, + "num_tokens": 475505946.0, + "step": 12464 + }, + { + "epoch": 1.5856761226307086, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9084415435791, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8551580309867859, + "num_tokens": 475542799.0, + "step": 12465 + }, + { + "epoch": 1.585803332909299, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.80985450744629, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8689825534820557, + "num_tokens": 475582064.0, + "step": 12466 + }, + { + "epoch": 1.5859305431878896, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91608428955078, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.863460898399353, + "num_tokens": 475615521.0, + "step": 12467 + }, + { + "epoch": 1.5860577534664801, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.909711837768555, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8759903311729431, + "num_tokens": 475650500.0, + "step": 12468 + }, + { + "epoch": 1.5861849637450707, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.729820251464844, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8695657849311829, + "num_tokens": 475695824.0, + "step": 12469 + }, + { + "epoch": 1.5863121740236612, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.796070098876953, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8464587926864624, + "num_tokens": 475734015.0, + "step": 12470 + }, + { + "epoch": 1.5864393843022517, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96088981628418, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8661374449729919, + "num_tokens": 475768961.0, + "step": 12471 + }, + { + "epoch": 1.5865665945808423, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79632568359375, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8824460506439209, + "num_tokens": 475806551.0, + "step": 12472 + }, + { + "epoch": 1.5866938048594328, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.816261291503906, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8613554239273071, + "num_tokens": 475844066.0, + "step": 12473 + }, + { + "epoch": 1.5868210151380233, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.762454986572266, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8734824061393738, + "num_tokens": 475874554.0, + "step": 12474 + }, + { + "epoch": 1.5869482254166136, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.828594207763672, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8747726678848267, + "num_tokens": 475915840.0, + "step": 12475 + }, + { + "epoch": 1.5870754356952042, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.792308807373047, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8664754629135132, + "num_tokens": 475951516.0, + "step": 12476 + }, + { + "epoch": 1.5872026459737947, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.993518829345703, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8708455562591553, + "num_tokens": 475984224.0, + "step": 12477 + }, + { + "epoch": 1.5873298562523852, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.925647735595703, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8466705083847046, + "num_tokens": 476015758.0, + "step": 12478 + }, + { + "epoch": 1.5874570665309757, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.768404006958008, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8655569553375244, + "num_tokens": 476051835.0, + "step": 12479 + }, + { + "epoch": 1.5875842768095663, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.773601531982422, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8644026517868042, + "num_tokens": 476090667.0, + "step": 12480 + }, + { + "epoch": 1.5877114870881566, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.82941246032715, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8661033511161804, + "num_tokens": 476130448.0, + "step": 12481 + }, + { + "epoch": 1.587838697366747, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.66498374938965, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8591862916946411, + "num_tokens": 476174495.0, + "step": 12482 + }, + { + "epoch": 1.5879659076453376, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00359535217285, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8763279914855957, + "num_tokens": 476213412.0, + "step": 12483 + }, + { + "epoch": 1.5880931179239282, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.055177688598633, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8811831474304199, + "num_tokens": 476250441.0, + "step": 12484 + }, + { + "epoch": 1.5882203282025187, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.667570114135742, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8819933533668518, + "num_tokens": 476284466.0, + "step": 12485 + }, + { + "epoch": 1.5883475384811092, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81675148010254, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8673650622367859, + "num_tokens": 476327760.0, + "step": 12486 + }, + { + "epoch": 1.5884747487596997, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.019765853881836, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.864255428314209, + "num_tokens": 476369259.0, + "step": 12487 + }, + { + "epoch": 1.5886019590382903, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.79078483581543, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8692306280136108, + "num_tokens": 476403915.0, + "step": 12488 + }, + { + "epoch": 1.5887291693168808, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.861093521118164, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8656812906265259, + "num_tokens": 476440837.0, + "step": 12489 + }, + { + "epoch": 1.5888563795954713, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.838546752929688, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8664737939834595, + "num_tokens": 476478478.0, + "step": 12490 + }, + { + "epoch": 1.5889835898740619, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.890146255493164, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8763622045516968, + "num_tokens": 476514675.0, + "step": 12491 + }, + { + "epoch": 1.5891108001526524, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.765111923217773, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8659332990646362, + "num_tokens": 476559015.0, + "step": 12492 + }, + { + "epoch": 1.589238010431243, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.976545333862305, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.852214515209198, + "num_tokens": 476601848.0, + "step": 12493 + }, + { + "epoch": 1.5893652207098334, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.776365280151367, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8758730888366699, + "num_tokens": 476636794.0, + "step": 12494 + }, + { + "epoch": 1.589492430988424, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.732168197631836, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8627372980117798, + "num_tokens": 476671593.0, + "step": 12495 + }, + { + "epoch": 1.5896196412670145, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.935264587402344, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.877875804901123, + "num_tokens": 476706776.0, + "step": 12496 + }, + { + "epoch": 1.589746851545605, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.814245223999023, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8668087720870972, + "num_tokens": 476746700.0, + "step": 12497 + }, + { + "epoch": 1.5898740618241956, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.986825942993164, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8736713528633118, + "num_tokens": 476785837.0, + "step": 12498 + }, + { + "epoch": 1.5900012721027859, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.90619659423828, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8734102845191956, + "num_tokens": 476818894.0, + "step": 12499 + }, + { + "epoch": 1.5901284823813764, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.747297286987305, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8508610725402832, + "num_tokens": 476861773.0, + "step": 12500 + }, + { + "epoch": 1.590255692659967, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.770828247070312, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8738564252853394, + "num_tokens": 476899522.0, + "step": 12501 + }, + { + "epoch": 1.5903829029385574, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1212158203125, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8611094951629639, + "num_tokens": 476939923.0, + "step": 12502 + }, + { + "epoch": 1.590510113217148, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.756162643432617, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8678508996963501, + "num_tokens": 476979867.0, + "step": 12503 + }, + { + "epoch": 1.5906373234957385, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.877370834350586, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8755087852478027, + "num_tokens": 477016068.0, + "step": 12504 + }, + { + "epoch": 1.5907645337743288, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.938156127929688, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8650680780410767, + "num_tokens": 477054526.0, + "step": 12505 + }, + { + "epoch": 1.5908917440529193, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89558982849121, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8629137277603149, + "num_tokens": 477098029.0, + "step": 12506 + }, + { + "epoch": 1.5910189543315099, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.869831085205078, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8744730949401855, + "num_tokens": 477137070.0, + "step": 12507 + }, + { + "epoch": 1.5911461646101004, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.77017593383789, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.866674542427063, + "num_tokens": 477175640.0, + "step": 12508 + }, + { + "epoch": 1.591273374888691, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.800029754638672, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8697155714035034, + "num_tokens": 477210935.0, + "step": 12509 + }, + { + "epoch": 1.5914005851672814, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.022659301757812, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8751022815704346, + "num_tokens": 477251567.0, + "step": 12510 + }, + { + "epoch": 1.591527795445872, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9108829498291, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8702826499938965, + "num_tokens": 477287964.0, + "step": 12511 + }, + { + "epoch": 1.5916550057244625, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.854372024536133, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8428949117660522, + "num_tokens": 477328660.0, + "step": 12512 + }, + { + "epoch": 1.591782216003053, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.928302764892578, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8726750612258911, + "num_tokens": 477364111.0, + "step": 12513 + }, + { + "epoch": 1.5919094262816436, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.913185119628906, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.87447589635849, + "num_tokens": 477402691.0, + "step": 12514 + }, + { + "epoch": 1.592036636560234, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95742416381836, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.881492018699646, + "num_tokens": 477437526.0, + "step": 12515 + }, + { + "epoch": 1.5921638468388246, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.015371322631836, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8664370179176331, + "num_tokens": 477479175.0, + "step": 12516 + }, + { + "epoch": 1.5922910571174151, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.88776397705078, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8704524040222168, + "num_tokens": 477516896.0, + "step": 12517 + }, + { + "epoch": 1.5924182673960057, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.897872924804688, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8586891889572144, + "num_tokens": 477555543.0, + "step": 12518 + }, + { + "epoch": 1.5925454776745962, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.988767623901367, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.879767656326294, + "num_tokens": 477592251.0, + "step": 12519 + }, + { + "epoch": 1.5926726879531867, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07830810546875, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8551959991455078, + "num_tokens": 477630039.0, + "step": 12520 + }, + { + "epoch": 1.5927998982317773, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.107213973999023, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8667855262756348, + "num_tokens": 477661028.0, + "step": 12521 + }, + { + "epoch": 1.5929271085103678, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9896297454834, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8812444806098938, + "num_tokens": 477699612.0, + "step": 12522 + }, + { + "epoch": 1.5930543187889583, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.073312759399414, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8637738823890686, + "num_tokens": 477737300.0, + "step": 12523 + }, + { + "epoch": 1.5931815290675486, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97684097290039, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8556699752807617, + "num_tokens": 477776031.0, + "step": 12524 + }, + { + "epoch": 1.5933087393461391, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.811614990234375, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8690834045410156, + "num_tokens": 477813640.0, + "step": 12525 + }, + { + "epoch": 1.5934359496247297, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.276813507080078, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8636445999145508, + "num_tokens": 477848652.0, + "step": 12526 + }, + { + "epoch": 1.5935631599033202, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85776138305664, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8638505935668945, + "num_tokens": 477883710.0, + "step": 12527 + }, + { + "epoch": 1.5936903701819107, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01749610900879, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8614163994789124, + "num_tokens": 477920316.0, + "step": 12528 + }, + { + "epoch": 1.5938175804605013, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.81195640563965, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8769409656524658, + "num_tokens": 477955523.0, + "step": 12529 + }, + { + "epoch": 1.5939447907390916, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.175378799438477, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8605445623397827, + "num_tokens": 477995768.0, + "step": 12530 + }, + { + "epoch": 1.594072001017682, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.939912796020508, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8674768209457397, + "num_tokens": 478037718.0, + "step": 12531 + }, + { + "epoch": 1.5941992112962726, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.088075637817383, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8558025360107422, + "num_tokens": 478077388.0, + "step": 12532 + }, + { + "epoch": 1.5943264215748632, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.936307907104492, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8832675814628601, + "num_tokens": 478109494.0, + "step": 12533 + }, + { + "epoch": 1.5944536318534537, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08234405517578, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8634266257286072, + "num_tokens": 478142420.0, + "step": 12534 + }, + { + "epoch": 1.5945808421320442, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.114885330200195, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8728024959564209, + "num_tokens": 478183848.0, + "step": 12535 + }, + { + "epoch": 1.5947080524106347, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.100133895874023, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8771010637283325, + "num_tokens": 478225242.0, + "step": 12536 + }, + { + "epoch": 1.5948352626892253, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.016178131103516, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8745312690734863, + "num_tokens": 478263167.0, + "step": 12537 + }, + { + "epoch": 1.5949624729678158, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99312400817871, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.868859052658081, + "num_tokens": 478297752.0, + "step": 12538 + }, + { + "epoch": 1.5950896832464063, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97863006591797, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8687365055084229, + "num_tokens": 478332955.0, + "step": 12539 + }, + { + "epoch": 1.5952168935249968, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.949405670166016, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8480799198150635, + "num_tokens": 478367298.0, + "step": 12540 + }, + { + "epoch": 1.5953441038035874, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.034055709838867, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8728059530258179, + "num_tokens": 478411183.0, + "step": 12541 + }, + { + "epoch": 1.595471314082178, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.823732376098633, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8658478260040283, + "num_tokens": 478451184.0, + "step": 12542 + }, + { + "epoch": 1.5955985243607684, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9167537689209, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8636889457702637, + "num_tokens": 478490913.0, + "step": 12543 + }, + { + "epoch": 1.595725734639359, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.131641387939453, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8693737387657166, + "num_tokens": 478529769.0, + "step": 12544 + }, + { + "epoch": 1.5958529449179495, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8897762298584, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8646466732025146, + "num_tokens": 478566665.0, + "step": 12545 + }, + { + "epoch": 1.59598015519654, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.251230239868164, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8947116136550903, + "num_tokens": 478600814.0, + "step": 12546 + }, + { + "epoch": 1.5961073654751305, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.102624893188477, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8708325624465942, + "num_tokens": 478641610.0, + "step": 12547 + }, + { + "epoch": 1.5962345757537209, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.965410232543945, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8765415549278259, + "num_tokens": 478678183.0, + "step": 12548 + }, + { + "epoch": 1.5963617860323114, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93387794494629, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8688793778419495, + "num_tokens": 478710881.0, + "step": 12549 + }, + { + "epoch": 1.596488996310902, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87919044494629, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8820565938949585, + "num_tokens": 478748550.0, + "step": 12550 + }, + { + "epoch": 1.5966162065894924, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.879640579223633, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8560940623283386, + "num_tokens": 478790417.0, + "step": 12551 + }, + { + "epoch": 1.596743416868083, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.829303741455078, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8739490509033203, + "num_tokens": 478828856.0, + "step": 12552 + }, + { + "epoch": 1.5968706271466735, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96737289428711, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8663946390151978, + "num_tokens": 478870240.0, + "step": 12553 + }, + { + "epoch": 1.5969978374252638, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.067462921142578, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8596411943435669, + "num_tokens": 478911920.0, + "step": 12554 + }, + { + "epoch": 1.5971250477038543, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.931285858154297, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8620526790618896, + "num_tokens": 478949083.0, + "step": 12555 + }, + { + "epoch": 1.5972522579824449, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.898447036743164, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8670698404312134, + "num_tokens": 478984087.0, + "step": 12556 + }, + { + "epoch": 1.5973794682610354, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.079002380371094, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8498606085777283, + "num_tokens": 479019364.0, + "step": 12557 + }, + { + "epoch": 1.597506678539626, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.04615592956543, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8708012700080872, + "num_tokens": 479059002.0, + "step": 12558 + }, + { + "epoch": 1.5976338888182164, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.934173583984375, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.867335319519043, + "num_tokens": 479092676.0, + "step": 12559 + }, + { + "epoch": 1.597761099096807, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.966228485107422, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.870861291885376, + "num_tokens": 479125951.0, + "step": 12560 + }, + { + "epoch": 1.5978883093753975, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93813705444336, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8840010166168213, + "num_tokens": 479166230.0, + "step": 12561 + }, + { + "epoch": 1.598015519653988, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87535858154297, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8650741577148438, + "num_tokens": 479205813.0, + "step": 12562 + }, + { + "epoch": 1.5981427299325786, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.927309036254883, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.864728569984436, + "num_tokens": 479244960.0, + "step": 12563 + }, + { + "epoch": 1.598269940211169, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.721221923828125, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.875146746635437, + "num_tokens": 479289617.0, + "step": 12564 + }, + { + "epoch": 1.5983971504897596, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01510238647461, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8690423965454102, + "num_tokens": 479326377.0, + "step": 12565 + }, + { + "epoch": 1.5985243607683501, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.704612731933594, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8675073981285095, + "num_tokens": 479360563.0, + "step": 12566 + }, + { + "epoch": 1.5986515710469407, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13558578491211, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.879551112651825, + "num_tokens": 479403042.0, + "step": 12567 + }, + { + "epoch": 1.5987787813255312, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.03612518310547, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8743886947631836, + "num_tokens": 479437099.0, + "step": 12568 + }, + { + "epoch": 1.5989059916041217, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.782068252563477, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8664469122886658, + "num_tokens": 479479326.0, + "step": 12569 + }, + { + "epoch": 1.5990332018827123, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.98244285583496, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8508285880088806, + "num_tokens": 479515320.0, + "step": 12570 + }, + { + "epoch": 1.5991604121613028, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.829334259033203, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8576402068138123, + "num_tokens": 479559960.0, + "step": 12571 + }, + { + "epoch": 1.5992876224398933, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.968996047973633, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8689745664596558, + "num_tokens": 479599115.0, + "step": 12572 + }, + { + "epoch": 1.5994148327184836, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.94102668762207, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8765723705291748, + "num_tokens": 479636116.0, + "step": 12573 + }, + { + "epoch": 1.5995420429970741, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.975645065307617, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8596552610397339, + "num_tokens": 479676880.0, + "step": 12574 + }, + { + "epoch": 1.5996692532756647, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.877349853515625, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8704404234886169, + "num_tokens": 479717424.0, + "step": 12575 + }, + { + "epoch": 1.5997964635542552, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.90182876586914, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.86751389503479, + "num_tokens": 479756240.0, + "step": 12576 + }, + { + "epoch": 1.5999236738328457, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.845600128173828, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8539146184921265, + "num_tokens": 479791863.0, + "step": 12577 + }, + { + "epoch": 1.6000508841114363, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.895675659179688, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.878269374370575, + "num_tokens": 479827252.0, + "step": 12578 + }, + { + "epoch": 1.6001780943900266, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87392234802246, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8648023009300232, + "num_tokens": 479863100.0, + "step": 12579 + }, + { + "epoch": 1.600305304668617, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99290657043457, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8573136329650879, + "num_tokens": 479899684.0, + "step": 12580 + }, + { + "epoch": 1.6004325149472076, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91650390625, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8685962557792664, + "num_tokens": 479937656.0, + "step": 12581 + }, + { + "epoch": 1.6005597252257981, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.021028518676758, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8640190958976746, + "num_tokens": 479973357.0, + "step": 12582 + }, + { + "epoch": 1.6006869355043887, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.22179412841797, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8689144253730774, + "num_tokens": 480017862.0, + "step": 12583 + }, + { + "epoch": 1.6008141457829792, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.779647827148438, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8738057017326355, + "num_tokens": 480057039.0, + "step": 12584 + }, + { + "epoch": 1.6009413560615697, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.063764572143555, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8714101314544678, + "num_tokens": 480093396.0, + "step": 12585 + }, + { + "epoch": 1.6010685663401603, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.92513084411621, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8637442588806152, + "num_tokens": 480125200.0, + "step": 12586 + }, + { + "epoch": 1.6011957766187508, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86376953125, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8663347959518433, + "num_tokens": 480158085.0, + "step": 12587 + }, + { + "epoch": 1.6013229868973413, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.051855087280273, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8843551874160767, + "num_tokens": 480192979.0, + "step": 12588 + }, + { + "epoch": 1.6014501971759318, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.013933181762695, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8782246708869934, + "num_tokens": 480227984.0, + "step": 12589 + }, + { + "epoch": 1.6015774074545224, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.680007934570312, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8854537606239319, + "num_tokens": 480264280.0, + "step": 12590 + }, + { + "epoch": 1.601704617733113, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.859533309936523, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8544951677322388, + "num_tokens": 480300670.0, + "step": 12591 + }, + { + "epoch": 1.6018318280117034, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99048614501953, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8667374849319458, + "num_tokens": 480334460.0, + "step": 12592 + }, + { + "epoch": 1.601959038290294, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01377296447754, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8780205845832825, + "num_tokens": 480376066.0, + "step": 12593 + }, + { + "epoch": 1.6020862485688845, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.94707679748535, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8745347857475281, + "num_tokens": 480417575.0, + "step": 12594 + }, + { + "epoch": 1.602213458847475, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.929582595825195, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8585236072540283, + "num_tokens": 480454106.0, + "step": 12595 + }, + { + "epoch": 1.6023406691260655, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84709930419922, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8561055660247803, + "num_tokens": 480493031.0, + "step": 12596 + }, + { + "epoch": 1.6024678794046558, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.924375534057617, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8699688911437988, + "num_tokens": 480528326.0, + "step": 12597 + }, + { + "epoch": 1.6025950896832464, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.967912673950195, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.876727283000946, + "num_tokens": 480565399.0, + "step": 12598 + }, + { + "epoch": 1.602722299961837, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83006477355957, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8838977217674255, + "num_tokens": 480600028.0, + "step": 12599 + }, + { + "epoch": 1.6028495102404274, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.945022583007812, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8682514429092407, + "num_tokens": 480635759.0, + "step": 12600 + }, + { + "epoch": 1.602976720519018, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.113122940063477, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8743890523910522, + "num_tokens": 480669528.0, + "step": 12601 + }, + { + "epoch": 1.6031039307976085, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.866273880004883, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8666000366210938, + "num_tokens": 480709425.0, + "step": 12602 + }, + { + "epoch": 1.6032311410761988, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.872608184814453, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8550760746002197, + "num_tokens": 480745010.0, + "step": 12603 + }, + { + "epoch": 1.6033583513547893, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.062070846557617, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8751264810562134, + "num_tokens": 480780320.0, + "step": 12604 + }, + { + "epoch": 1.6034855616333799, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.864126205444336, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8760160207748413, + "num_tokens": 480819335.0, + "step": 12605 + }, + { + "epoch": 1.6036127719119704, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15233039855957, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.881717324256897, + "num_tokens": 480852702.0, + "step": 12606 + }, + { + "epoch": 1.603739982190561, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.114158630371094, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8628137707710266, + "num_tokens": 480890258.0, + "step": 12607 + }, + { + "epoch": 1.6038671924691514, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0769100189209, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8681668639183044, + "num_tokens": 480929555.0, + "step": 12608 + }, + { + "epoch": 1.603994402747742, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.066917419433594, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8908206820487976, + "num_tokens": 480970295.0, + "step": 12609 + }, + { + "epoch": 1.6041216130263325, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.842374801635742, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.869959831237793, + "num_tokens": 481011213.0, + "step": 12610 + }, + { + "epoch": 1.604248823304923, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95077896118164, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.864046573638916, + "num_tokens": 481045273.0, + "step": 12611 + }, + { + "epoch": 1.6043760335835135, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08816146850586, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8497625589370728, + "num_tokens": 481084567.0, + "step": 12612 + }, + { + "epoch": 1.604503243862104, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93712043762207, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8699709177017212, + "num_tokens": 481119455.0, + "step": 12613 + }, + { + "epoch": 1.6046304541406946, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.934295654296875, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8540911674499512, + "num_tokens": 481154718.0, + "step": 12614 + }, + { + "epoch": 1.6047576644192851, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.021080017089844, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8631454706192017, + "num_tokens": 481193716.0, + "step": 12615 + }, + { + "epoch": 1.6048848746978757, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.892398834228516, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.844536304473877, + "num_tokens": 481231378.0, + "step": 12616 + }, + { + "epoch": 1.6050120849764662, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.918933868408203, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8668615818023682, + "num_tokens": 481267907.0, + "step": 12617 + }, + { + "epoch": 1.6051392952550567, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.773176193237305, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8695669770240784, + "num_tokens": 481309899.0, + "step": 12618 + }, + { + "epoch": 1.6052665055336472, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95854949951172, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8781179189682007, + "num_tokens": 481348845.0, + "step": 12619 + }, + { + "epoch": 1.6053937158122378, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.844411849975586, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8587255477905273, + "num_tokens": 481384999.0, + "step": 12620 + }, + { + "epoch": 1.6055209260908283, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.959434509277344, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8704634308815002, + "num_tokens": 481417287.0, + "step": 12621 + }, + { + "epoch": 1.6056481363694186, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0418701171875, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8721990585327148, + "num_tokens": 481455943.0, + "step": 12622 + }, + { + "epoch": 1.6057753466480091, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95892333984375, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8829224109649658, + "num_tokens": 481492017.0, + "step": 12623 + }, + { + "epoch": 1.6059025569265997, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.5400333404541, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.862349271774292, + "num_tokens": 481535115.0, + "step": 12624 + }, + { + "epoch": 1.6060297672051902, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.024335861206055, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.85273277759552, + "num_tokens": 481580048.0, + "step": 12625 + }, + { + "epoch": 1.6061569774837807, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.002111434936523, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8795768022537231, + "num_tokens": 481616748.0, + "step": 12626 + }, + { + "epoch": 1.6062841877623713, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.676021575927734, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8396580219268799, + "num_tokens": 481659123.0, + "step": 12627 + }, + { + "epoch": 1.6064113980409616, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.894784927368164, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8597261905670166, + "num_tokens": 481704791.0, + "step": 12628 + }, + { + "epoch": 1.606538608319552, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.274328231811523, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8557910919189453, + "num_tokens": 481742913.0, + "step": 12629 + }, + { + "epoch": 1.6066658185981426, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.656391143798828, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8553551435470581, + "num_tokens": 481783549.0, + "step": 12630 + }, + { + "epoch": 1.6067930288767331, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.992626190185547, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8740042448043823, + "num_tokens": 481818486.0, + "step": 12631 + }, + { + "epoch": 1.6069202391553237, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86910629272461, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.868656575679779, + "num_tokens": 481849969.0, + "step": 12632 + }, + { + "epoch": 1.6070474494339142, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.840789794921875, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.852368950843811, + "num_tokens": 481886108.0, + "step": 12633 + }, + { + "epoch": 1.6071746597125047, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91786003112793, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.879259467124939, + "num_tokens": 481919389.0, + "step": 12634 + }, + { + "epoch": 1.6073018699910953, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.80300521850586, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8810023069381714, + "num_tokens": 481958531.0, + "step": 12635 + }, + { + "epoch": 1.6074290802696858, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.058788299560547, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8643181324005127, + "num_tokens": 481999864.0, + "step": 12636 + }, + { + "epoch": 1.6075562905482763, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95106315612793, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8908510208129883, + "num_tokens": 482033890.0, + "step": 12637 + }, + { + "epoch": 1.6076835008268668, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84341812133789, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8738961219787598, + "num_tokens": 482071186.0, + "step": 12638 + }, + { + "epoch": 1.6078107111054574, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.893932342529297, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8669112920761108, + "num_tokens": 482107218.0, + "step": 12639 + }, + { + "epoch": 1.607937921384048, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.893646240234375, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.864138126373291, + "num_tokens": 482146101.0, + "step": 12640 + }, + { + "epoch": 1.6080651316626384, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87879753112793, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8813799619674683, + "num_tokens": 482178531.0, + "step": 12641 + }, + { + "epoch": 1.608192341941229, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.013023376464844, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8775037527084351, + "num_tokens": 482217019.0, + "step": 12642 + }, + { + "epoch": 1.6083195522198195, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.760784149169922, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8691785335540771, + "num_tokens": 482250547.0, + "step": 12643 + }, + { + "epoch": 1.60844676249841, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.117080688476562, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8644295930862427, + "num_tokens": 482287009.0, + "step": 12644 + }, + { + "epoch": 1.6085739727770005, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.850168228149414, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8556700348854065, + "num_tokens": 482327373.0, + "step": 12645 + }, + { + "epoch": 1.6087011830555908, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83898162841797, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8571163415908813, + "num_tokens": 482363409.0, + "step": 12646 + }, + { + "epoch": 1.6088283933341814, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93691062927246, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.858578085899353, + "num_tokens": 482396499.0, + "step": 12647 + }, + { + "epoch": 1.608955603612772, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.93180274963379, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8692319393157959, + "num_tokens": 482435059.0, + "step": 12648 + }, + { + "epoch": 1.6090828138913624, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.896364212036133, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.861387312412262, + "num_tokens": 482472793.0, + "step": 12649 + }, + { + "epoch": 1.609210024169953, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75016975402832, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8703052997589111, + "num_tokens": 482508857.0, + "step": 12650 + }, + { + "epoch": 1.6093372344485435, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.271839141845703, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8514627814292908, + "num_tokens": 482548526.0, + "step": 12651 + }, + { + "epoch": 1.6094644447271338, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.765459060668945, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8595331907272339, + "num_tokens": 482586461.0, + "step": 12652 + }, + { + "epoch": 1.6095916550057243, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.044939041137695, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8613303899765015, + "num_tokens": 482624642.0, + "step": 12653 + }, + { + "epoch": 1.6097188652843148, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.04924774169922, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.866478681564331, + "num_tokens": 482664096.0, + "step": 12654 + }, + { + "epoch": 1.6098460755629054, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.163732528686523, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8783591985702515, + "num_tokens": 482701230.0, + "step": 12655 + }, + { + "epoch": 1.609973285841496, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.787559509277344, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8685075640678406, + "num_tokens": 482741611.0, + "step": 12656 + }, + { + "epoch": 1.6101004961200864, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.227985382080078, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8777331113815308, + "num_tokens": 482779482.0, + "step": 12657 + }, + { + "epoch": 1.610227706398677, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14606475830078, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8756966590881348, + "num_tokens": 482816279.0, + "step": 12658 + }, + { + "epoch": 1.6103549166772675, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.093162536621094, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8593432903289795, + "num_tokens": 482859941.0, + "step": 12659 + }, + { + "epoch": 1.610482126955858, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.013938903808594, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8730728626251221, + "num_tokens": 482894348.0, + "step": 12660 + }, + { + "epoch": 1.6106093372344485, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86695671081543, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8626683354377747, + "num_tokens": 482930882.0, + "step": 12661 + }, + { + "epoch": 1.610736547513039, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.897483825683594, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8507760167121887, + "num_tokens": 482969515.0, + "step": 12662 + }, + { + "epoch": 1.6108637577916296, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.181758880615234, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8649695515632629, + "num_tokens": 483008074.0, + "step": 12663 + }, + { + "epoch": 1.6109909680702201, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.066591262817383, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8603694438934326, + "num_tokens": 483038920.0, + "step": 12664 + }, + { + "epoch": 1.6111181783488107, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.781341552734375, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8786693215370178, + "num_tokens": 483075559.0, + "step": 12665 + }, + { + "epoch": 1.6112453886274012, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.021589279174805, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8717741966247559, + "num_tokens": 483114378.0, + "step": 12666 + }, + { + "epoch": 1.6113725989059917, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.950218200683594, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8721113801002502, + "num_tokens": 483147911.0, + "step": 12667 + }, + { + "epoch": 1.6114998091845822, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.158512115478516, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8825296759605408, + "num_tokens": 483188275.0, + "step": 12668 + }, + { + "epoch": 1.6116270194631728, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.864551544189453, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8791588544845581, + "num_tokens": 483230804.0, + "step": 12669 + }, + { + "epoch": 1.6117542297417633, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.026174545288086, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8708648085594177, + "num_tokens": 483262368.0, + "step": 12670 + }, + { + "epoch": 1.6118814400203536, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.781597137451172, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8570500612258911, + "num_tokens": 483300894.0, + "step": 12671 + }, + { + "epoch": 1.6120086502989441, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.895978927612305, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8746616244316101, + "num_tokens": 483337377.0, + "step": 12672 + }, + { + "epoch": 1.6121358605775347, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.04100799560547, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.867453932762146, + "num_tokens": 483376572.0, + "step": 12673 + }, + { + "epoch": 1.6122630708561252, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.708532333374023, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8568651676177979, + "num_tokens": 483415584.0, + "step": 12674 + }, + { + "epoch": 1.6123902811347157, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.938745498657227, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8694353103637695, + "num_tokens": 483459584.0, + "step": 12675 + }, + { + "epoch": 1.6125174914133062, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.745986938476562, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.862319827079773, + "num_tokens": 483504260.0, + "step": 12676 + }, + { + "epoch": 1.6126447016918966, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.861244201660156, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8700981140136719, + "num_tokens": 483547271.0, + "step": 12677 + }, + { + "epoch": 1.612771911970487, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9106502532959, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8707813024520874, + "num_tokens": 483582255.0, + "step": 12678 + }, + { + "epoch": 1.6128991222490776, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.751060485839844, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8638938665390015, + "num_tokens": 483622290.0, + "step": 12679 + }, + { + "epoch": 1.6130263325276681, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07061767578125, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8806394934654236, + "num_tokens": 483657253.0, + "step": 12680 + }, + { + "epoch": 1.6131535428062587, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.02066993713379, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8725032806396484, + "num_tokens": 483693062.0, + "step": 12681 + }, + { + "epoch": 1.6132807530848492, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.977161407470703, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8628962635993958, + "num_tokens": 483735845.0, + "step": 12682 + }, + { + "epoch": 1.6134079633634397, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84730339050293, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.871819019317627, + "num_tokens": 483783833.0, + "step": 12683 + }, + { + "epoch": 1.6135351736420303, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08490562438965, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.868017315864563, + "num_tokens": 483820175.0, + "step": 12684 + }, + { + "epoch": 1.6136623839206208, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.946889877319336, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8793200254440308, + "num_tokens": 483863454.0, + "step": 12685 + }, + { + "epoch": 1.6137895941992113, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.895654678344727, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8878416419029236, + "num_tokens": 483900657.0, + "step": 12686 + }, + { + "epoch": 1.6139168044778018, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0714168548584, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8612996935844421, + "num_tokens": 483935912.0, + "step": 12687 + }, + { + "epoch": 1.6140440147563924, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.909875869750977, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8685224652290344, + "num_tokens": 483968594.0, + "step": 12688 + }, + { + "epoch": 1.614171225034983, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.03502082824707, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8774747252464294, + "num_tokens": 484010866.0, + "step": 12689 + }, + { + "epoch": 1.6142984353135734, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.034629821777344, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8624140024185181, + "num_tokens": 484051355.0, + "step": 12690 + }, + { + "epoch": 1.614425645592164, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.045299530029297, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8788493275642395, + "num_tokens": 484087049.0, + "step": 12691 + }, + { + "epoch": 1.6145528558707545, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.10049057006836, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8576333522796631, + "num_tokens": 484124914.0, + "step": 12692 + }, + { + "epoch": 1.614680066149345, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.926414489746094, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.885691225528717, + "num_tokens": 484154524.0, + "step": 12693 + }, + { + "epoch": 1.6148072764279355, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.062206268310547, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8677065968513489, + "num_tokens": 484195091.0, + "step": 12694 + }, + { + "epoch": 1.6149344867065258, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.05315399169922, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8703954219818115, + "num_tokens": 484236787.0, + "step": 12695 + }, + { + "epoch": 1.6150616969851164, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.934053421020508, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8638575673103333, + "num_tokens": 484281748.0, + "step": 12696 + }, + { + "epoch": 1.615188907263707, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.055198669433594, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8748371601104736, + "num_tokens": 484321181.0, + "step": 12697 + }, + { + "epoch": 1.6153161175422974, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9997501373291, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8659530282020569, + "num_tokens": 484360476.0, + "step": 12698 + }, + { + "epoch": 1.615443327820888, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.896947860717773, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8697524666786194, + "num_tokens": 484395065.0, + "step": 12699 + }, + { + "epoch": 1.6155705380994785, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.839704513549805, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8709284663200378, + "num_tokens": 484439309.0, + "step": 12700 + }, + { + "epoch": 1.6156977483780688, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1117000579834, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8692305684089661, + "num_tokens": 484473018.0, + "step": 12701 + }, + { + "epoch": 1.6158249586566593, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.032976150512695, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8754501342773438, + "num_tokens": 484518306.0, + "step": 12702 + }, + { + "epoch": 1.6159521689352498, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.919776916503906, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8549767732620239, + "num_tokens": 484556348.0, + "step": 12703 + }, + { + "epoch": 1.6160793792138404, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.911033630371094, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8796249032020569, + "num_tokens": 484593136.0, + "step": 12704 + }, + { + "epoch": 1.616206589492431, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.064332962036133, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8788567781448364, + "num_tokens": 484628025.0, + "step": 12705 + }, + { + "epoch": 1.6163337997710214, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.969938278198242, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8745628595352173, + "num_tokens": 484666703.0, + "step": 12706 + }, + { + "epoch": 1.616461010049612, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.950706481933594, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8784822225570679, + "num_tokens": 484703417.0, + "step": 12707 + }, + { + "epoch": 1.6165882203282025, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89798355102539, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8828490972518921, + "num_tokens": 484741830.0, + "step": 12708 + }, + { + "epoch": 1.616715430606793, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.92336654663086, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.866934061050415, + "num_tokens": 484780140.0, + "step": 12709 + }, + { + "epoch": 1.6168426408853835, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.030685424804688, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8670162558555603, + "num_tokens": 484813856.0, + "step": 12710 + }, + { + "epoch": 1.616969851163974, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.962039947509766, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8785471320152283, + "num_tokens": 484851383.0, + "step": 12711 + }, + { + "epoch": 1.6170970614425646, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.046342849731445, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8780609965324402, + "num_tokens": 484884361.0, + "step": 12712 + }, + { + "epoch": 1.6172242717211551, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.988414764404297, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8487036228179932, + "num_tokens": 484922106.0, + "step": 12713 + }, + { + "epoch": 1.6173514819997457, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.986854553222656, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.870046854019165, + "num_tokens": 484963100.0, + "step": 12714 + }, + { + "epoch": 1.6174786922783362, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.884950637817383, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8805537223815918, + "num_tokens": 485006896.0, + "step": 12715 + }, + { + "epoch": 1.6176059025569267, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.06429672241211, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8695584535598755, + "num_tokens": 485045968.0, + "step": 12716 + }, + { + "epoch": 1.6177331128355172, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.935623168945312, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8660211563110352, + "num_tokens": 485084736.0, + "step": 12717 + }, + { + "epoch": 1.6178603231141078, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.883094787597656, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8771565556526184, + "num_tokens": 485123130.0, + "step": 12718 + }, + { + "epoch": 1.6179875333926983, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9520206451416, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8668217658996582, + "num_tokens": 485158072.0, + "step": 12719 + }, + { + "epoch": 1.6181147436712886, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.054363250732422, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.877517819404602, + "num_tokens": 485196783.0, + "step": 12720 + }, + { + "epoch": 1.6182419539498791, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.866777420043945, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8755094408988953, + "num_tokens": 485231433.0, + "step": 12721 + }, + { + "epoch": 1.6183691642284697, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85626220703125, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8675297498703003, + "num_tokens": 485269016.0, + "step": 12722 + }, + { + "epoch": 1.6184963745070602, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.974702835083008, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8582087755203247, + "num_tokens": 485308293.0, + "step": 12723 + }, + { + "epoch": 1.6186235847856507, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.843812942504883, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8786683082580566, + "num_tokens": 485340758.0, + "step": 12724 + }, + { + "epoch": 1.6187507950642412, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.04063606262207, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8845207691192627, + "num_tokens": 485384047.0, + "step": 12725 + }, + { + "epoch": 1.6188780053428315, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.838960647583008, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8726426362991333, + "num_tokens": 485416909.0, + "step": 12726 + }, + { + "epoch": 1.619005215621422, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.927518844604492, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8704071640968323, + "num_tokens": 485449532.0, + "step": 12727 + }, + { + "epoch": 1.6191324259000126, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96141815185547, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.880061686038971, + "num_tokens": 485491171.0, + "step": 12728 + }, + { + "epoch": 1.6192596361786031, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9521541595459, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8651841878890991, + "num_tokens": 485525511.0, + "step": 12729 + }, + { + "epoch": 1.6193868464571937, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.922178268432617, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8653353452682495, + "num_tokens": 485562517.0, + "step": 12730 + }, + { + "epoch": 1.6195140567357842, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.976781845092773, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.870458722114563, + "num_tokens": 485601058.0, + "step": 12731 + }, + { + "epoch": 1.6196412670143747, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91434669494629, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8830799460411072, + "num_tokens": 485634737.0, + "step": 12732 + }, + { + "epoch": 1.6197684772929652, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.896577835083008, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.87314772605896, + "num_tokens": 485671300.0, + "step": 12733 + }, + { + "epoch": 1.6198956875715558, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96103858947754, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8781802654266357, + "num_tokens": 485708712.0, + "step": 12734 + }, + { + "epoch": 1.6200228978501463, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.960403442382812, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8822592496871948, + "num_tokens": 485746820.0, + "step": 12735 + }, + { + "epoch": 1.6201501081287368, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85015296936035, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8701552152633667, + "num_tokens": 485785921.0, + "step": 12736 + }, + { + "epoch": 1.6202773184073274, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.068256378173828, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8703464269638062, + "num_tokens": 485825766.0, + "step": 12737 + }, + { + "epoch": 1.6204045286859179, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.979581832885742, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.86366868019104, + "num_tokens": 485862288.0, + "step": 12738 + }, + { + "epoch": 1.6205317389645084, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.92568588256836, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.863227128982544, + "num_tokens": 485903582.0, + "step": 12739 + }, + { + "epoch": 1.620658949243099, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.889894485473633, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8661317825317383, + "num_tokens": 485950931.0, + "step": 12740 + }, + { + "epoch": 1.6207861595216895, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.109399795532227, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8713113069534302, + "num_tokens": 485984185.0, + "step": 12741 + }, + { + "epoch": 1.62091336980028, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.762832641601562, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8743332624435425, + "num_tokens": 486018796.0, + "step": 12742 + }, + { + "epoch": 1.6210405800788705, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.905790328979492, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8562139272689819, + "num_tokens": 486055328.0, + "step": 12743 + }, + { + "epoch": 1.6211677903574608, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.936124801635742, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8649898767471313, + "num_tokens": 486092340.0, + "step": 12744 + }, + { + "epoch": 1.6212950006360514, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.900197982788086, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8572308421134949, + "num_tokens": 486123195.0, + "step": 12745 + }, + { + "epoch": 1.621422210914642, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.751943588256836, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8793115019798279, + "num_tokens": 486167145.0, + "step": 12746 + }, + { + "epoch": 1.6215494211932324, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.999862670898438, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8616651296615601, + "num_tokens": 486207925.0, + "step": 12747 + }, + { + "epoch": 1.621676631471823, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.74401092529297, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8747154474258423, + "num_tokens": 486245446.0, + "step": 12748 + }, + { + "epoch": 1.6218038417504135, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.909770965576172, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8698127269744873, + "num_tokens": 486280461.0, + "step": 12749 + }, + { + "epoch": 1.6219310520290038, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.017532348632812, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8575209379196167, + "num_tokens": 486323926.0, + "step": 12750 + }, + { + "epoch": 1.6220582623075943, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.849079132080078, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8631494641304016, + "num_tokens": 486360016.0, + "step": 12751 + }, + { + "epoch": 1.6221854725861848, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12240982055664, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8729997873306274, + "num_tokens": 486394994.0, + "step": 12752 + }, + { + "epoch": 1.6223126828647754, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.850496292114258, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8687850832939148, + "num_tokens": 486436909.0, + "step": 12753 + }, + { + "epoch": 1.622439893143366, + "ewc_loss": 0.032958984375, + "ewc_loss_parallel": 3.2901763916015625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.027341842651367, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8751828670501709, + "num_tokens": 486473201.0, + "step": 12754 + }, + { + "epoch": 1.6225671034219564, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.202285766601562, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8505091667175293, + "num_tokens": 486510733.0, + "step": 12755 + }, + { + "epoch": 1.622694313700547, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.923171997070312, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8852525949478149, + "num_tokens": 486546666.0, + "step": 12756 + }, + { + "epoch": 1.6228215239791375, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.02178955078125, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8506020903587341, + "num_tokens": 486585639.0, + "step": 12757 + }, + { + "epoch": 1.622948734257728, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91625213623047, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8745056390762329, + "num_tokens": 486623028.0, + "step": 12758 + }, + { + "epoch": 1.6230759445363185, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.737207412719727, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8656162023544312, + "num_tokens": 486655906.0, + "step": 12759 + }, + { + "epoch": 1.623203154814909, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.008758544921875, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8664345741271973, + "num_tokens": 486689461.0, + "step": 12760 + }, + { + "epoch": 1.6233303650934996, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89680290222168, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8546364307403564, + "num_tokens": 486728373.0, + "step": 12761 + }, + { + "epoch": 1.6234575753720901, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00855827331543, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8792507648468018, + "num_tokens": 486769955.0, + "step": 12762 + }, + { + "epoch": 1.6235847856506807, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08500099182129, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8737585544586182, + "num_tokens": 486806468.0, + "step": 12763 + }, + { + "epoch": 1.6237119959292712, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.671030044555664, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8566561937332153, + "num_tokens": 486846526.0, + "step": 12764 + }, + { + "epoch": 1.6238392062078617, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.94597053527832, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8873254060745239, + "num_tokens": 486886323.0, + "step": 12765 + }, + { + "epoch": 1.6239664164864522, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.020509719848633, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8515664339065552, + "num_tokens": 486918099.0, + "step": 12766 + }, + { + "epoch": 1.6240936267650428, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.09572410583496, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8839246034622192, + "num_tokens": 486953789.0, + "step": 12767 + }, + { + "epoch": 1.6242208370436333, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.188600540161133, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8680858612060547, + "num_tokens": 486996165.0, + "step": 12768 + }, + { + "epoch": 1.6243480473222236, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85915184020996, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8802257776260376, + "num_tokens": 487028249.0, + "step": 12769 + }, + { + "epoch": 1.6244752576008141, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.874481201171875, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8706247806549072, + "num_tokens": 487070099.0, + "step": 12770 + }, + { + "epoch": 1.6246024678794047, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.074430465698242, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8737935423851013, + "num_tokens": 487107117.0, + "step": 12771 + }, + { + "epoch": 1.6247296781579952, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.004175186157227, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8579950332641602, + "num_tokens": 487146024.0, + "step": 12772 + }, + { + "epoch": 1.6248568884365857, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.044353485107422, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8573098182678223, + "num_tokens": 487189864.0, + "step": 12773 + }, + { + "epoch": 1.6249840987151762, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.890159606933594, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8723764419555664, + "num_tokens": 487230307.0, + "step": 12774 + }, + { + "epoch": 1.6251113089937665, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.268455505371094, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8726162910461426, + "num_tokens": 487261387.0, + "step": 12775 + }, + { + "epoch": 1.625238519272357, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.813030242919922, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8580375909805298, + "num_tokens": 487298611.0, + "step": 12776 + }, + { + "epoch": 1.6253657295509476, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.928604125976562, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8692558407783508, + "num_tokens": 487338172.0, + "step": 12777 + }, + { + "epoch": 1.6254929398295381, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1873836517334, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8711403012275696, + "num_tokens": 487375126.0, + "step": 12778 + }, + { + "epoch": 1.6256201501081287, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.70229148864746, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.879863440990448, + "num_tokens": 487416539.0, + "step": 12779 + }, + { + "epoch": 1.6257473603867192, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07306671142578, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8714452385902405, + "num_tokens": 487448429.0, + "step": 12780 + }, + { + "epoch": 1.6258745706653097, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.827892303466797, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8695927858352661, + "num_tokens": 487486054.0, + "step": 12781 + }, + { + "epoch": 1.6260017809439002, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.955509185791016, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8754792213439941, + "num_tokens": 487523771.0, + "step": 12782 + }, + { + "epoch": 1.6261289912224908, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.003427505493164, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8749780654907227, + "num_tokens": 487561334.0, + "step": 12783 + }, + { + "epoch": 1.6262562015010813, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.916399002075195, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8700700402259827, + "num_tokens": 487606382.0, + "step": 12784 + }, + { + "epoch": 1.6263834117796718, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0128173828125, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8654499053955078, + "num_tokens": 487647640.0, + "step": 12785 + }, + { + "epoch": 1.6265106220582624, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.90862464904785, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8669906258583069, + "num_tokens": 487682604.0, + "step": 12786 + }, + { + "epoch": 1.6266378323368529, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.956092834472656, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8731882572174072, + "num_tokens": 487718271.0, + "step": 12787 + }, + { + "epoch": 1.6267650426154434, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.856948852539062, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8572583198547363, + "num_tokens": 487752872.0, + "step": 12788 + }, + { + "epoch": 1.626892252894034, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14982795715332, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8595741987228394, + "num_tokens": 487798309.0, + "step": 12789 + }, + { + "epoch": 1.6270194631726245, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.911758422851562, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.858289361000061, + "num_tokens": 487834052.0, + "step": 12790 + }, + { + "epoch": 1.627146673451215, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.081186294555664, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8580465316772461, + "num_tokens": 487868662.0, + "step": 12791 + }, + { + "epoch": 1.6272738837298055, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.047618865966797, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8666613698005676, + "num_tokens": 487907425.0, + "step": 12792 + }, + { + "epoch": 1.6274010940083958, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85258674621582, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8779479265213013, + "num_tokens": 487942979.0, + "step": 12793 + }, + { + "epoch": 1.6275283042869864, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.88975715637207, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8741402626037598, + "num_tokens": 487982402.0, + "step": 12794 + }, + { + "epoch": 1.6276555145655769, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86790657043457, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8769020438194275, + "num_tokens": 488020071.0, + "step": 12795 + }, + { + "epoch": 1.6277827248441674, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85838508605957, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8564982414245605, + "num_tokens": 488057457.0, + "step": 12796 + }, + { + "epoch": 1.627909935122758, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.960525512695312, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.865493655204773, + "num_tokens": 488094002.0, + "step": 12797 + }, + { + "epoch": 1.6280371454013485, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.02393341064453, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8549357652664185, + "num_tokens": 488134995.0, + "step": 12798 + }, + { + "epoch": 1.6281643556799388, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.834203720092773, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8734539151191711, + "num_tokens": 488169942.0, + "step": 12799 + }, + { + "epoch": 1.6282915659585293, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.961254119873047, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8715505003929138, + "num_tokens": 488210852.0, + "step": 12800 + }, + { + "epoch": 1.6284187762371198, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.032840728759766, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8725311160087585, + "num_tokens": 488253067.0, + "step": 12801 + }, + { + "epoch": 1.6285459865157104, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.944826126098633, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.877128005027771, + "num_tokens": 488293498.0, + "step": 12802 + }, + { + "epoch": 1.628673196794301, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.838211059570312, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8714990615844727, + "num_tokens": 488333931.0, + "step": 12803 + }, + { + "epoch": 1.6288004070728914, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.977359771728516, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8747849464416504, + "num_tokens": 488371724.0, + "step": 12804 + }, + { + "epoch": 1.628927617351482, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85287094116211, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8637676239013672, + "num_tokens": 488415171.0, + "step": 12805 + }, + { + "epoch": 1.6290548276300725, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08127784729004, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8638661503791809, + "num_tokens": 488451672.0, + "step": 12806 + }, + { + "epoch": 1.629182037908663, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.808168411254883, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.884172260761261, + "num_tokens": 488492657.0, + "step": 12807 + }, + { + "epoch": 1.6293092481872535, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.831201553344727, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8765477538108826, + "num_tokens": 488530681.0, + "step": 12808 + }, + { + "epoch": 1.629436458465844, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.027090072631836, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8794339299201965, + "num_tokens": 488560542.0, + "step": 12809 + }, + { + "epoch": 1.6295636687444346, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.643478393554688, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8711991310119629, + "num_tokens": 488597491.0, + "step": 12810 + }, + { + "epoch": 1.6296908790230251, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.965837478637695, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8768344521522522, + "num_tokens": 488634556.0, + "step": 12811 + }, + { + "epoch": 1.6298180893016156, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.839111328125, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8660098910331726, + "num_tokens": 488668174.0, + "step": 12812 + }, + { + "epoch": 1.6299452995802062, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.983186721801758, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8718481063842773, + "num_tokens": 488705994.0, + "step": 12813 + }, + { + "epoch": 1.6300725098587967, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.821853637695312, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8581138849258423, + "num_tokens": 488746908.0, + "step": 12814 + }, + { + "epoch": 1.6301997201373872, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.03999137878418, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8693078756332397, + "num_tokens": 488785607.0, + "step": 12815 + }, + { + "epoch": 1.6303269304159778, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.061687469482422, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8673157691955566, + "num_tokens": 488823831.0, + "step": 12816 + }, + { + "epoch": 1.630454140694568, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.797670364379883, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8768103122711182, + "num_tokens": 488859944.0, + "step": 12817 + }, + { + "epoch": 1.6305813509731586, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.94513511657715, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8547543287277222, + "num_tokens": 488903764.0, + "step": 12818 + }, + { + "epoch": 1.6307085612517491, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86768913269043, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8597397804260254, + "num_tokens": 488946890.0, + "step": 12819 + }, + { + "epoch": 1.6308357715303397, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.011991500854492, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8668069243431091, + "num_tokens": 488981901.0, + "step": 12820 + }, + { + "epoch": 1.6309629818089302, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.007020950317383, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.871876060962677, + "num_tokens": 489024180.0, + "step": 12821 + }, + { + "epoch": 1.6310901920875207, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.016536712646484, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8734278082847595, + "num_tokens": 489070971.0, + "step": 12822 + }, + { + "epoch": 1.6312174023661112, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.84328842163086, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8788007497787476, + "num_tokens": 489111636.0, + "step": 12823 + }, + { + "epoch": 1.6313446126447015, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.112634658813477, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8672460317611694, + "num_tokens": 489148677.0, + "step": 12824 + }, + { + "epoch": 1.631471822923292, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.928884506225586, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8574853539466858, + "num_tokens": 489186222.0, + "step": 12825 + }, + { + "epoch": 1.6315990332018826, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.072425842285156, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8433094620704651, + "num_tokens": 489223173.0, + "step": 12826 + }, + { + "epoch": 1.6317262434804731, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.065366744995117, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8697820901870728, + "num_tokens": 489259236.0, + "step": 12827 + }, + { + "epoch": 1.6318534537590637, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.741844177246094, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8617917895317078, + "num_tokens": 489298449.0, + "step": 12828 + }, + { + "epoch": 1.6319806640376542, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.80242156982422, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8484206199645996, + "num_tokens": 489334800.0, + "step": 12829 + }, + { + "epoch": 1.6321078743162447, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.20319366455078, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8753591775894165, + "num_tokens": 489378510.0, + "step": 12830 + }, + { + "epoch": 1.6322350845948352, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.972944259643555, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8575180768966675, + "num_tokens": 489419267.0, + "step": 12831 + }, + { + "epoch": 1.6323622948734258, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.025135040283203, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.87277752161026, + "num_tokens": 489454772.0, + "step": 12832 + }, + { + "epoch": 1.6324895051520163, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14767837524414, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8635647296905518, + "num_tokens": 489490146.0, + "step": 12833 + }, + { + "epoch": 1.6326167154306068, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.988332748413086, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8398178219795227, + "num_tokens": 489532754.0, + "step": 12834 + }, + { + "epoch": 1.6327439257091974, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.282987594604492, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8673866391181946, + "num_tokens": 489570378.0, + "step": 12835 + }, + { + "epoch": 1.6328711359877879, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.83713150024414, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8613201379776001, + "num_tokens": 489604562.0, + "step": 12836 + }, + { + "epoch": 1.6329983462663784, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.031944274902344, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8772523403167725, + "num_tokens": 489644060.0, + "step": 12837 + }, + { + "epoch": 1.633125556544969, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.143062591552734, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8706799745559692, + "num_tokens": 489682693.0, + "step": 12838 + }, + { + "epoch": 1.6332527668235595, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.897844314575195, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8772566914558411, + "num_tokens": 489720786.0, + "step": 12839 + }, + { + "epoch": 1.63337997710215, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.034826278686523, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8745733499526978, + "num_tokens": 489758773.0, + "step": 12840 + }, + { + "epoch": 1.6335071873807405, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95684814453125, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8691372275352478, + "num_tokens": 489793436.0, + "step": 12841 + }, + { + "epoch": 1.6336343976593308, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.019128799438477, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8873651027679443, + "num_tokens": 489828634.0, + "step": 12842 + }, + { + "epoch": 1.6337616079379214, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.078027725219727, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.873177707195282, + "num_tokens": 489866335.0, + "step": 12843 + }, + { + "epoch": 1.6338888182165119, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.102487564086914, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8776599168777466, + "num_tokens": 489903059.0, + "step": 12844 + }, + { + "epoch": 1.6340160284951024, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.963722229003906, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8765133619308472, + "num_tokens": 489940186.0, + "step": 12845 + }, + { + "epoch": 1.634143238773693, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97170066833496, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8809767961502075, + "num_tokens": 489984732.0, + "step": 12846 + }, + { + "epoch": 1.6342704490522835, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.944807052612305, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8706469535827637, + "num_tokens": 490016699.0, + "step": 12847 + }, + { + "epoch": 1.6343976593308738, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.111902236938477, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.861037015914917, + "num_tokens": 490051351.0, + "step": 12848 + }, + { + "epoch": 1.6345248696094643, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.965829849243164, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8780704736709595, + "num_tokens": 490089668.0, + "step": 12849 + }, + { + "epoch": 1.6346520798880548, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.967994689941406, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8699166774749756, + "num_tokens": 490129651.0, + "step": 12850 + }, + { + "epoch": 1.6347792901666454, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.006486892700195, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.862457811832428, + "num_tokens": 490168441.0, + "step": 12851 + }, + { + "epoch": 1.6349065004452359, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.026226043701172, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.87947678565979, + "num_tokens": 490204754.0, + "step": 12852 + }, + { + "epoch": 1.6350337107238264, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.987762451171875, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8839965462684631, + "num_tokens": 490244178.0, + "step": 12853 + }, + { + "epoch": 1.635160921002417, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.19985008239746, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8646561503410339, + "num_tokens": 490281161.0, + "step": 12854 + }, + { + "epoch": 1.6352881312810075, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.842695236206055, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.871199905872345, + "num_tokens": 490314892.0, + "step": 12855 + }, + { + "epoch": 1.635415341559598, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.372243881225586, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8585042953491211, + "num_tokens": 490352600.0, + "step": 12856 + }, + { + "epoch": 1.6355425518381885, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.988866806030273, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8821772336959839, + "num_tokens": 490384487.0, + "step": 12857 + }, + { + "epoch": 1.635669762116779, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.078527450561523, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8488348722457886, + "num_tokens": 490422199.0, + "step": 12858 + }, + { + "epoch": 1.6357969723953696, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.066057205200195, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8613215088844299, + "num_tokens": 490458058.0, + "step": 12859 + }, + { + "epoch": 1.6359241826739601, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.985679626464844, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8802779316902161, + "num_tokens": 490498447.0, + "step": 12860 + }, + { + "epoch": 1.6360513929525506, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13146209716797, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8574893474578857, + "num_tokens": 490538502.0, + "step": 12861 + }, + { + "epoch": 1.6361786032311412, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.916601181030273, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8721028566360474, + "num_tokens": 490575752.0, + "step": 12862 + }, + { + "epoch": 1.6363058135097317, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95637321472168, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8697288036346436, + "num_tokens": 490613502.0, + "step": 12863 + }, + { + "epoch": 1.6364330237883222, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.04680824279785, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8642514944076538, + "num_tokens": 490653820.0, + "step": 12864 + }, + { + "epoch": 1.6365602340669128, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.944765090942383, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8544118404388428, + "num_tokens": 490693332.0, + "step": 12865 + }, + { + "epoch": 1.636687444345503, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08855438232422, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.868018627166748, + "num_tokens": 490733135.0, + "step": 12866 + }, + { + "epoch": 1.6368146546240936, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.986759185791016, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8723795413970947, + "num_tokens": 490775505.0, + "step": 12867 + }, + { + "epoch": 1.6369418649026841, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.983186721801758, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8685349225997925, + "num_tokens": 490817783.0, + "step": 12868 + }, + { + "epoch": 1.6370690751812746, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.989166259765625, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8642159700393677, + "num_tokens": 490849464.0, + "step": 12869 + }, + { + "epoch": 1.6371962854598652, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.895553588867188, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8624845743179321, + "num_tokens": 490891925.0, + "step": 12870 + }, + { + "epoch": 1.6373234957384557, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.819721221923828, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.862120509147644, + "num_tokens": 490926221.0, + "step": 12871 + }, + { + "epoch": 1.6374507060170462, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.245092391967773, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8600360751152039, + "num_tokens": 490964141.0, + "step": 12872 + }, + { + "epoch": 1.6375779162956365, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.001806259155273, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.865210771560669, + "num_tokens": 490997917.0, + "step": 12873 + }, + { + "epoch": 1.637705126574227, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.953779220581055, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8831956386566162, + "num_tokens": 491029244.0, + "step": 12874 + }, + { + "epoch": 1.6378323368528176, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00155258178711, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8687843084335327, + "num_tokens": 491066591.0, + "step": 12875 + }, + { + "epoch": 1.6379595471314081, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.106794357299805, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8672740459442139, + "num_tokens": 491106546.0, + "step": 12876 + }, + { + "epoch": 1.6380867574099987, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.007610321044922, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8616570234298706, + "num_tokens": 491145512.0, + "step": 12877 + }, + { + "epoch": 1.6382139676885892, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11480140686035, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8669549226760864, + "num_tokens": 491185624.0, + "step": 12878 + }, + { + "epoch": 1.6383411779671797, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.005413055419922, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8633061051368713, + "num_tokens": 491220982.0, + "step": 12879 + }, + { + "epoch": 1.6384683882457702, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.102468490600586, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8707004189491272, + "num_tokens": 491262501.0, + "step": 12880 + }, + { + "epoch": 1.6385955985243608, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01066780090332, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8801986575126648, + "num_tokens": 491298078.0, + "step": 12881 + }, + { + "epoch": 1.6387228088029513, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.013168334960938, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8750724196434021, + "num_tokens": 491336203.0, + "step": 12882 + }, + { + "epoch": 1.6388500190815418, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.206716537475586, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8580036163330078, + "num_tokens": 491374723.0, + "step": 12883 + }, + { + "epoch": 1.6389772293601323, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.107088088989258, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8705286383628845, + "num_tokens": 491410546.0, + "step": 12884 + }, + { + "epoch": 1.6391044396387229, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.955774307250977, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8643767833709717, + "num_tokens": 491450642.0, + "step": 12885 + }, + { + "epoch": 1.6392316499173134, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12474822998047, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8662731647491455, + "num_tokens": 491489893.0, + "step": 12886 + }, + { + "epoch": 1.639358860195904, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.081239700317383, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8593264818191528, + "num_tokens": 491535870.0, + "step": 12887 + }, + { + "epoch": 1.6394860704744945, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01531410217285, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8674735426902771, + "num_tokens": 491575988.0, + "step": 12888 + }, + { + "epoch": 1.639613280753085, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99896812438965, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8701977133750916, + "num_tokens": 491616822.0, + "step": 12889 + }, + { + "epoch": 1.6397404910316755, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.058696746826172, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8877930641174316, + "num_tokens": 491654132.0, + "step": 12890 + }, + { + "epoch": 1.6398677013102658, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.141605377197266, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8561295866966248, + "num_tokens": 491697215.0, + "step": 12891 + }, + { + "epoch": 1.6399949115888564, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.084245681762695, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8477694988250732, + "num_tokens": 491731338.0, + "step": 12892 + }, + { + "epoch": 1.6401221218674469, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.220788955688477, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8670349717140198, + "num_tokens": 491770808.0, + "step": 12893 + }, + { + "epoch": 1.6402493321460374, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.069887161254883, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.876294732093811, + "num_tokens": 491804167.0, + "step": 12894 + }, + { + "epoch": 1.640376542424628, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.921764373779297, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8704275488853455, + "num_tokens": 491840094.0, + "step": 12895 + }, + { + "epoch": 1.6405037527032185, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.282176971435547, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8551991581916809, + "num_tokens": 491881643.0, + "step": 12896 + }, + { + "epoch": 1.6406309629818088, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.005739212036133, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.855709969997406, + "num_tokens": 491920951.0, + "step": 12897 + }, + { + "epoch": 1.6407581732603993, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.011695861816406, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8655268549919128, + "num_tokens": 491954599.0, + "step": 12898 + }, + { + "epoch": 1.6408853835389898, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08651351928711, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8696355223655701, + "num_tokens": 491995375.0, + "step": 12899 + }, + { + "epoch": 1.6410125938175804, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.184526443481445, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8643143177032471, + "num_tokens": 492035297.0, + "step": 12900 + }, + { + "epoch": 1.6411398040961709, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.905542373657227, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8694344758987427, + "num_tokens": 492071003.0, + "step": 12901 + }, + { + "epoch": 1.6412670143747614, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.283105850219727, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8686118125915527, + "num_tokens": 492106046.0, + "step": 12902 + }, + { + "epoch": 1.641394224653352, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.005844116210938, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8801889419555664, + "num_tokens": 492147252.0, + "step": 12903 + }, + { + "epoch": 1.6415214349319425, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.98373031616211, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8603615760803223, + "num_tokens": 492188185.0, + "step": 12904 + }, + { + "epoch": 1.641648645210533, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.10741424560547, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8736080527305603, + "num_tokens": 492219630.0, + "step": 12905 + }, + { + "epoch": 1.6417758554891235, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.152509689331055, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8707855343818665, + "num_tokens": 492251955.0, + "step": 12906 + }, + { + "epoch": 1.641903065767714, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87342071533203, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8711380958557129, + "num_tokens": 492293484.0, + "step": 12907 + }, + { + "epoch": 1.6420302760463046, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0939884185791, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8543530702590942, + "num_tokens": 492331169.0, + "step": 12908 + }, + { + "epoch": 1.642157486324895, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.944150924682617, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8764899373054504, + "num_tokens": 492366071.0, + "step": 12909 + }, + { + "epoch": 1.6422846966034856, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.065677642822266, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8885864019393921, + "num_tokens": 492396820.0, + "step": 12910 + }, + { + "epoch": 1.6424119068820762, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.906797409057617, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8759019374847412, + "num_tokens": 492435282.0, + "step": 12911 + }, + { + "epoch": 1.6425391171606667, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.938358306884766, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8764307498931885, + "num_tokens": 492470195.0, + "step": 12912 + }, + { + "epoch": 1.6426663274392572, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17658805847168, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.863410472869873, + "num_tokens": 492514265.0, + "step": 12913 + }, + { + "epoch": 1.6427935377178478, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.015098571777344, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8712029457092285, + "num_tokens": 492550348.0, + "step": 12914 + }, + { + "epoch": 1.642920747996438, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.100900650024414, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8717348575592041, + "num_tokens": 492590648.0, + "step": 12915 + }, + { + "epoch": 1.6430479582750286, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15152931213379, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8712221384048462, + "num_tokens": 492629523.0, + "step": 12916 + }, + { + "epoch": 1.6431751685536191, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.099803924560547, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8646341562271118, + "num_tokens": 492668392.0, + "step": 12917 + }, + { + "epoch": 1.6433023788322096, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.211036682128906, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8587816953659058, + "num_tokens": 492696073.0, + "step": 12918 + }, + { + "epoch": 1.6434295891108002, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.85862922668457, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8712372779846191, + "num_tokens": 492728590.0, + "step": 12919 + }, + { + "epoch": 1.6435567993893907, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14400291442871, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8622380495071411, + "num_tokens": 492767290.0, + "step": 12920 + }, + { + "epoch": 1.6436840096679812, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.072179794311523, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8798607587814331, + "num_tokens": 492801019.0, + "step": 12921 + }, + { + "epoch": 1.6438112199465715, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.034503936767578, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8682866096496582, + "num_tokens": 492841220.0, + "step": 12922 + }, + { + "epoch": 1.643938430225162, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.071922302246094, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.858788251876831, + "num_tokens": 492885076.0, + "step": 12923 + }, + { + "epoch": 1.6440656405037526, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01218032836914, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8644648790359497, + "num_tokens": 492918554.0, + "step": 12924 + }, + { + "epoch": 1.6441928507823431, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.878856658935547, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8844959735870361, + "num_tokens": 492961890.0, + "step": 12925 + }, + { + "epoch": 1.6443200610609336, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.090572357177734, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8786565065383911, + "num_tokens": 492997768.0, + "step": 12926 + }, + { + "epoch": 1.6444472713395242, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.959659576416016, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8943365216255188, + "num_tokens": 493037017.0, + "step": 12927 + }, + { + "epoch": 1.6445744816181147, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.145536422729492, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8787563443183899, + "num_tokens": 493074579.0, + "step": 12928 + }, + { + "epoch": 1.6447016918967052, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.032512664794922, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8702487945556641, + "num_tokens": 493111249.0, + "step": 12929 + }, + { + "epoch": 1.6448289021752958, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.889686584472656, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8540195226669312, + "num_tokens": 493150723.0, + "step": 12930 + }, + { + "epoch": 1.6449561124538863, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.979799270629883, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8669676780700684, + "num_tokens": 493189530.0, + "step": 12931 + }, + { + "epoch": 1.6450833227324768, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.092744827270508, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8765559196472168, + "num_tokens": 493231369.0, + "step": 12932 + }, + { + "epoch": 1.6452105330110673, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.138479232788086, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8552658557891846, + "num_tokens": 493278430.0, + "step": 12933 + }, + { + "epoch": 1.6453377432896579, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.19117546081543, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8742088675498962, + "num_tokens": 493312258.0, + "step": 12934 + }, + { + "epoch": 1.6454649535682484, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.035921096801758, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8684099316596985, + "num_tokens": 493347725.0, + "step": 12935 + }, + { + "epoch": 1.645592163846839, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.102087020874023, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8724196553230286, + "num_tokens": 493388119.0, + "step": 12936 + }, + { + "epoch": 1.6457193741254295, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.153043746948242, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8696163296699524, + "num_tokens": 493424607.0, + "step": 12937 + }, + { + "epoch": 1.64584658440402, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.086854934692383, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8466683626174927, + "num_tokens": 493459546.0, + "step": 12938 + }, + { + "epoch": 1.6459737946826105, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.16440773010254, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8729872703552246, + "num_tokens": 493494162.0, + "step": 12939 + }, + { + "epoch": 1.6461010049612008, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89092445373535, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.866439700126648, + "num_tokens": 493537523.0, + "step": 12940 + }, + { + "epoch": 1.6462282152397913, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.099973678588867, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8670134544372559, + "num_tokens": 493580065.0, + "step": 12941 + }, + { + "epoch": 1.6463554255183819, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.22297477722168, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8759168386459351, + "num_tokens": 493626717.0, + "step": 12942 + }, + { + "epoch": 1.6464826357969724, + "ewc_loss": 0.033203125, + "ewc_loss_parallel": 3.314018249511719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.015779495239258, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8751225471496582, + "num_tokens": 493666266.0, + "step": 12943 + }, + { + "epoch": 1.646609846075563, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00301742553711, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8585106134414673, + "num_tokens": 493703184.0, + "step": 12944 + }, + { + "epoch": 1.6467370563541535, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.032976150512695, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8846999406814575, + "num_tokens": 493744061.0, + "step": 12945 + }, + { + "epoch": 1.6468642666327438, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.137874603271484, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8694103956222534, + "num_tokens": 493783585.0, + "step": 12946 + }, + { + "epoch": 1.6469914769113343, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18329429626465, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8751053810119629, + "num_tokens": 493827781.0, + "step": 12947 + }, + { + "epoch": 1.6471186871899248, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.803680419921875, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.873400092124939, + "num_tokens": 493864418.0, + "step": 12948 + }, + { + "epoch": 1.6472458974685154, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.063045501708984, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8694796562194824, + "num_tokens": 493895226.0, + "step": 12949 + }, + { + "epoch": 1.6473731077471059, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.201486587524414, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8612009286880493, + "num_tokens": 493935210.0, + "step": 12950 + }, + { + "epoch": 1.6475003180256964, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.066707611083984, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8682621717453003, + "num_tokens": 493966368.0, + "step": 12951 + }, + { + "epoch": 1.647627528304287, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.150218963623047, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8624913692474365, + "num_tokens": 494004304.0, + "step": 12952 + }, + { + "epoch": 1.6477547385828775, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.052658081054688, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8760607242584229, + "num_tokens": 494044758.0, + "step": 12953 + }, + { + "epoch": 1.647881948861468, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.110017776489258, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8537551164627075, + "num_tokens": 494080624.0, + "step": 12954 + }, + { + "epoch": 1.6480091591400585, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.925058364868164, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8747992515563965, + "num_tokens": 494116215.0, + "step": 12955 + }, + { + "epoch": 1.648136369418649, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.104877471923828, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.867762565612793, + "num_tokens": 494150317.0, + "step": 12956 + }, + { + "epoch": 1.6482635796972396, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.080284118652344, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8824318647384644, + "num_tokens": 494187273.0, + "step": 12957 + }, + { + "epoch": 1.64839078997583, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.988483428955078, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8653020858764648, + "num_tokens": 494226921.0, + "step": 12958 + }, + { + "epoch": 1.6485180002544206, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.021690368652344, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8757179975509644, + "num_tokens": 494263141.0, + "step": 12959 + }, + { + "epoch": 1.6486452105330112, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.878332138061523, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8647018671035767, + "num_tokens": 494307925.0, + "step": 12960 + }, + { + "epoch": 1.6487724208116017, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.071157455444336, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8625774383544922, + "num_tokens": 494353428.0, + "step": 12961 + }, + { + "epoch": 1.6488996310901922, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.942113876342773, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.857124924659729, + "num_tokens": 494383559.0, + "step": 12962 + }, + { + "epoch": 1.6490268413687827, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.945871353149414, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8721797466278076, + "num_tokens": 494419564.0, + "step": 12963 + }, + { + "epoch": 1.649154051647373, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.142913818359375, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8720201253890991, + "num_tokens": 494466886.0, + "step": 12964 + }, + { + "epoch": 1.6492812619259636, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.944196701049805, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8514584302902222, + "num_tokens": 494506672.0, + "step": 12965 + }, + { + "epoch": 1.649408472204554, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.201467514038086, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8685760498046875, + "num_tokens": 494541881.0, + "step": 12966 + }, + { + "epoch": 1.6495356824831446, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.043405532836914, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8735286593437195, + "num_tokens": 494583546.0, + "step": 12967 + }, + { + "epoch": 1.6496628927617352, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.012285232543945, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8569965362548828, + "num_tokens": 494616625.0, + "step": 12968 + }, + { + "epoch": 1.6497901030403257, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.147619247436523, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8820911049842834, + "num_tokens": 494654590.0, + "step": 12969 + }, + { + "epoch": 1.6499173133189162, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8625545501709, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8838103413581848, + "num_tokens": 494692471.0, + "step": 12970 + }, + { + "epoch": 1.6500445235975065, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.134490966796875, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8611916303634644, + "num_tokens": 494730347.0, + "step": 12971 + }, + { + "epoch": 1.650171733876097, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.944292068481445, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8579668998718262, + "num_tokens": 494763753.0, + "step": 12972 + }, + { + "epoch": 1.6502989441546876, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18687629699707, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8644711971282959, + "num_tokens": 494800416.0, + "step": 12973 + }, + { + "epoch": 1.6504261544332781, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0097599029541, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8670494556427002, + "num_tokens": 494842733.0, + "step": 12974 + }, + { + "epoch": 1.6505533647118686, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0228271484375, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8614853024482727, + "num_tokens": 494878661.0, + "step": 12975 + }, + { + "epoch": 1.6506805749904592, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.054094314575195, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8619082570075989, + "num_tokens": 494918142.0, + "step": 12976 + }, + { + "epoch": 1.6508077852690497, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.085391998291016, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8682845830917358, + "num_tokens": 494956947.0, + "step": 12977 + }, + { + "epoch": 1.6509349955476402, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.207069396972656, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8645455241203308, + "num_tokens": 494989850.0, + "step": 12978 + }, + { + "epoch": 1.6510622058262308, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.132368087768555, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8608393669128418, + "num_tokens": 495026770.0, + "step": 12979 + }, + { + "epoch": 1.6511894161048213, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.981494903564453, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8799399137496948, + "num_tokens": 495065009.0, + "step": 12980 + }, + { + "epoch": 1.6513166263834118, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.056034088134766, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8698355555534363, + "num_tokens": 495096914.0, + "step": 12981 + }, + { + "epoch": 1.6514438366620023, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11854362487793, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8724904656410217, + "num_tokens": 495138348.0, + "step": 12982 + }, + { + "epoch": 1.6515710469405929, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.186115264892578, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8624920845031738, + "num_tokens": 495177026.0, + "step": 12983 + }, + { + "epoch": 1.6516982572191834, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.962081909179688, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8707795143127441, + "num_tokens": 495213850.0, + "step": 12984 + }, + { + "epoch": 1.651825467497774, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0447998046875, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8774386644363403, + "num_tokens": 495257213.0, + "step": 12985 + }, + { + "epoch": 1.6519526777763645, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.068918228149414, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8594790697097778, + "num_tokens": 495300018.0, + "step": 12986 + }, + { + "epoch": 1.652079888054955, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.88606071472168, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8677237629890442, + "num_tokens": 495336471.0, + "step": 12987 + }, + { + "epoch": 1.6522070983335455, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.134599685668945, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8722184896469116, + "num_tokens": 495374680.0, + "step": 12988 + }, + { + "epoch": 1.6523343086121358, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.927356719970703, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8717625141143799, + "num_tokens": 495414096.0, + "step": 12989 + }, + { + "epoch": 1.6524615188907263, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.896482467651367, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.856447696685791, + "num_tokens": 495452955.0, + "step": 12990 + }, + { + "epoch": 1.6525887291693169, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.097597122192383, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8662331104278564, + "num_tokens": 495490091.0, + "step": 12991 + }, + { + "epoch": 1.6527159394479074, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.921281814575195, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8770649433135986, + "num_tokens": 495524543.0, + "step": 12992 + }, + { + "epoch": 1.652843149726498, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0205020904541, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.859907865524292, + "num_tokens": 495562068.0, + "step": 12993 + }, + { + "epoch": 1.6529703600050885, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.045421600341797, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8769334554672241, + "num_tokens": 495598355.0, + "step": 12994 + }, + { + "epoch": 1.6530975702836788, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.09498405456543, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8551341891288757, + "num_tokens": 495634735.0, + "step": 12995 + }, + { + "epoch": 1.6532247805622693, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.988481521606445, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8686323165893555, + "num_tokens": 495668804.0, + "step": 12996 + }, + { + "epoch": 1.6533519908408598, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.110490798950195, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8730831146240234, + "num_tokens": 495707129.0, + "step": 12997 + }, + { + "epoch": 1.6534792011194503, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95849609375, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8478118181228638, + "num_tokens": 495746501.0, + "step": 12998 + }, + { + "epoch": 1.6536064113980409, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33791160583496, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.884772777557373, + "num_tokens": 495789723.0, + "step": 12999 + }, + { + "epoch": 1.6537336216766314, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.236406326293945, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8602616786956787, + "num_tokens": 495827073.0, + "step": 13000 + }, + { + "epoch": 1.653860831955222, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.032054901123047, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8584318161010742, + "num_tokens": 495862362.0, + "step": 13001 + }, + { + "epoch": 1.6539880422338125, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.000347137451172, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8710552453994751, + "num_tokens": 495905546.0, + "step": 13002 + }, + { + "epoch": 1.654115252512403, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.25724983215332, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8670852184295654, + "num_tokens": 495942291.0, + "step": 13003 + }, + { + "epoch": 1.6542424627909935, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97768783569336, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8584965467453003, + "num_tokens": 495978631.0, + "step": 13004 + }, + { + "epoch": 1.654369673069584, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.151973724365234, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.863147497177124, + "num_tokens": 496016626.0, + "step": 13005 + }, + { + "epoch": 1.6544968833481746, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.111791610717773, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8598545789718628, + "num_tokens": 496051949.0, + "step": 13006 + }, + { + "epoch": 1.654624093626765, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.121225357055664, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8739076256752014, + "num_tokens": 496092693.0, + "step": 13007 + }, + { + "epoch": 1.6547513039053556, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11876106262207, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.862459659576416, + "num_tokens": 496132853.0, + "step": 13008 + }, + { + "epoch": 1.6548785141839462, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.961519241333008, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8538579344749451, + "num_tokens": 496168303.0, + "step": 13009 + }, + { + "epoch": 1.6550057244625367, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1163330078125, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8565888404846191, + "num_tokens": 496206132.0, + "step": 13010 + }, + { + "epoch": 1.6551329347411272, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.162105560302734, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8618338108062744, + "num_tokens": 496247542.0, + "step": 13011 + }, + { + "epoch": 1.6552601450197177, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.204557418823242, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8779290914535522, + "num_tokens": 496285698.0, + "step": 13012 + }, + { + "epoch": 1.655387355298308, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.124223709106445, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8700944185256958, + "num_tokens": 496322216.0, + "step": 13013 + }, + { + "epoch": 1.6555145655768986, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.188899993896484, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8662095665931702, + "num_tokens": 496357687.0, + "step": 13014 + }, + { + "epoch": 1.655641775855489, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13261604309082, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8801751136779785, + "num_tokens": 496400875.0, + "step": 13015 + }, + { + "epoch": 1.6557689861340796, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12824058532715, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8712387084960938, + "num_tokens": 496432076.0, + "step": 13016 + }, + { + "epoch": 1.6558961964126702, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.277835845947266, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.865103006362915, + "num_tokens": 496466852.0, + "step": 13017 + }, + { + "epoch": 1.6560234066912607, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.010074615478516, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8674606084823608, + "num_tokens": 496508890.0, + "step": 13018 + }, + { + "epoch": 1.6561506169698512, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.185527801513672, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8838537931442261, + "num_tokens": 496537964.0, + "step": 13019 + }, + { + "epoch": 1.6562778272484415, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.086063385009766, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.851852297782898, + "num_tokens": 496571263.0, + "step": 13020 + }, + { + "epoch": 1.656405037527032, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.141578674316406, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.877328634262085, + "num_tokens": 496610058.0, + "step": 13021 + }, + { + "epoch": 1.6565322478056226, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12108039855957, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8621033430099487, + "num_tokens": 496640542.0, + "step": 13022 + }, + { + "epoch": 1.656659458084213, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.110855102539062, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8673362731933594, + "num_tokens": 496678030.0, + "step": 13023 + }, + { + "epoch": 1.6567866683628036, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.030982971191406, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8653227686882019, + "num_tokens": 496714057.0, + "step": 13024 + }, + { + "epoch": 1.6569138786413942, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.128704071044922, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8840867877006531, + "num_tokens": 496744501.0, + "step": 13025 + }, + { + "epoch": 1.6570410889199847, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.170625686645508, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8724757432937622, + "num_tokens": 496780233.0, + "step": 13026 + }, + { + "epoch": 1.6571682991985752, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18811798095703, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.868309736251831, + "num_tokens": 496809645.0, + "step": 13027 + }, + { + "epoch": 1.6572955094771658, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.913021087646484, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.868476152420044, + "num_tokens": 496854871.0, + "step": 13028 + }, + { + "epoch": 1.6574227197557563, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.05144691467285, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8842204809188843, + "num_tokens": 496891958.0, + "step": 13029 + }, + { + "epoch": 1.6575499300343468, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.100749969482422, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.861020028591156, + "num_tokens": 496928550.0, + "step": 13030 + }, + { + "epoch": 1.6576771403129373, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15387535095215, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8669642210006714, + "num_tokens": 496972009.0, + "step": 13031 + }, + { + "epoch": 1.6578043505915279, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18918228149414, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8670517206192017, + "num_tokens": 497013605.0, + "step": 13032 + }, + { + "epoch": 1.6579315608701184, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.095714569091797, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8698199987411499, + "num_tokens": 497048712.0, + "step": 13033 + }, + { + "epoch": 1.658058771148709, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.23537254333496, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8821923136711121, + "num_tokens": 497089161.0, + "step": 13034 + }, + { + "epoch": 1.6581859814272994, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.954214096069336, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8412421941757202, + "num_tokens": 497127253.0, + "step": 13035 + }, + { + "epoch": 1.65831319170589, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.122228622436523, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8644552230834961, + "num_tokens": 497167637.0, + "step": 13036 + }, + { + "epoch": 1.6584404019844805, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.227479934692383, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.868594229221344, + "num_tokens": 497205663.0, + "step": 13037 + }, + { + "epoch": 1.6585676122630708, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01057243347168, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8853563666343689, + "num_tokens": 497242146.0, + "step": 13038 + }, + { + "epoch": 1.6586948225416613, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.27328872680664, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8738008737564087, + "num_tokens": 497279339.0, + "step": 13039 + }, + { + "epoch": 1.6588220328202519, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13826560974121, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8600025773048401, + "num_tokens": 497318530.0, + "step": 13040 + }, + { + "epoch": 1.6589492430988424, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.236215591430664, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8634238243103027, + "num_tokens": 497352943.0, + "step": 13041 + }, + { + "epoch": 1.659076453377433, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.118553161621094, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8629633188247681, + "num_tokens": 497387729.0, + "step": 13042 + }, + { + "epoch": 1.6592036636560235, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.070825576782227, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8697071075439453, + "num_tokens": 497427875.0, + "step": 13043 + }, + { + "epoch": 1.6593308739346138, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07570457458496, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8606870770454407, + "num_tokens": 497469076.0, + "step": 13044 + }, + { + "epoch": 1.6594580842132043, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.010356903076172, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.869991660118103, + "num_tokens": 497509556.0, + "step": 13045 + }, + { + "epoch": 1.6595852944917948, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.059650421142578, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8610689043998718, + "num_tokens": 497548151.0, + "step": 13046 + }, + { + "epoch": 1.6597125047703853, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.091306686401367, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8430487513542175, + "num_tokens": 497589088.0, + "step": 13047 + }, + { + "epoch": 1.6598397150489759, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.145328521728516, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.878070592880249, + "num_tokens": 497632095.0, + "step": 13048 + }, + { + "epoch": 1.6599669253275664, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.98311424255371, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8798400163650513, + "num_tokens": 497665148.0, + "step": 13049 + }, + { + "epoch": 1.660094135606157, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.075645446777344, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8628252744674683, + "num_tokens": 497700134.0, + "step": 13050 + }, + { + "epoch": 1.6602213458847475, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.130273818969727, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8734506368637085, + "num_tokens": 497743101.0, + "step": 13051 + }, + { + "epoch": 1.660348556163338, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1572208404541, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8617392778396606, + "num_tokens": 497775112.0, + "step": 13052 + }, + { + "epoch": 1.6604757664419285, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.067726135253906, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.879570484161377, + "num_tokens": 497813320.0, + "step": 13053 + }, + { + "epoch": 1.660602976720519, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2874698638916, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8639389276504517, + "num_tokens": 497852780.0, + "step": 13054 + }, + { + "epoch": 1.6607301869991096, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91731834411621, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8715686202049255, + "num_tokens": 497886313.0, + "step": 13055 + }, + { + "epoch": 1.6608573972777, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.062679290771484, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8657656908035278, + "num_tokens": 497924108.0, + "step": 13056 + }, + { + "epoch": 1.6609846075562906, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.02791976928711, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8576828241348267, + "num_tokens": 497964656.0, + "step": 13057 + }, + { + "epoch": 1.6611118178348812, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.109737396240234, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8548973798751831, + "num_tokens": 498005436.0, + "step": 13058 + }, + { + "epoch": 1.6612390281134717, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.19226837158203, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8641881942749023, + "num_tokens": 498042237.0, + "step": 13059 + }, + { + "epoch": 1.6613662383920622, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11725616455078, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8669962882995605, + "num_tokens": 498083008.0, + "step": 13060 + }, + { + "epoch": 1.6614934486706527, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.186960220336914, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8447876572608948, + "num_tokens": 498120550.0, + "step": 13061 + }, + { + "epoch": 1.661620658949243, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.957595825195312, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8655645847320557, + "num_tokens": 498153033.0, + "step": 13062 + }, + { + "epoch": 1.6617478692278336, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.98737144470215, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.870420515537262, + "num_tokens": 498190491.0, + "step": 13063 + }, + { + "epoch": 1.661875079506424, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.22743797302246, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8612627983093262, + "num_tokens": 498235889.0, + "step": 13064 + }, + { + "epoch": 1.6620022897850146, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.148244857788086, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8646308183670044, + "num_tokens": 498277191.0, + "step": 13065 + }, + { + "epoch": 1.6621295000636052, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0527286529541, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8576313257217407, + "num_tokens": 498318443.0, + "step": 13066 + }, + { + "epoch": 1.6622567103421957, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.277841567993164, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8559209108352661, + "num_tokens": 498355955.0, + "step": 13067 + }, + { + "epoch": 1.662383920620786, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.988605499267578, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8690634965896606, + "num_tokens": 498388588.0, + "step": 13068 + }, + { + "epoch": 1.6625111308993765, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.285676956176758, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8663829565048218, + "num_tokens": 498430661.0, + "step": 13069 + }, + { + "epoch": 1.662638341177967, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.154541015625, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8556791543960571, + "num_tokens": 498470428.0, + "step": 13070 + }, + { + "epoch": 1.6627655514565576, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.989187240600586, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8676340579986572, + "num_tokens": 498512031.0, + "step": 13071 + }, + { + "epoch": 1.662892761735148, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.155763626098633, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8899036049842834, + "num_tokens": 498548344.0, + "step": 13072 + }, + { + "epoch": 1.6630199720137386, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.90941619873047, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.881158709526062, + "num_tokens": 498590115.0, + "step": 13073 + }, + { + "epoch": 1.6631471822923292, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.067304611206055, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8686752319335938, + "num_tokens": 498634034.0, + "step": 13074 + }, + { + "epoch": 1.6632743925709197, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.147537231445312, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8604649305343628, + "num_tokens": 498675086.0, + "step": 13075 + }, + { + "epoch": 1.6634016028495102, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.98744773864746, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8737097978591919, + "num_tokens": 498711374.0, + "step": 13076 + }, + { + "epoch": 1.6635288131281007, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.131023406982422, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8728863596916199, + "num_tokens": 498753050.0, + "step": 13077 + }, + { + "epoch": 1.6636560234066913, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.867502212524414, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8694136738777161, + "num_tokens": 498790406.0, + "step": 13078 + }, + { + "epoch": 1.6637832336852818, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14692497253418, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8673190474510193, + "num_tokens": 498827712.0, + "step": 13079 + }, + { + "epoch": 1.6639104439638723, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.093521118164062, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8723106384277344, + "num_tokens": 498862283.0, + "step": 13080 + }, + { + "epoch": 1.6640376542424629, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.921478271484375, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8692763447761536, + "num_tokens": 498902686.0, + "step": 13081 + }, + { + "epoch": 1.6641648645210534, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.282756805419922, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8575414419174194, + "num_tokens": 498942785.0, + "step": 13082 + }, + { + "epoch": 1.664292074799644, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.8931941986084, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8728569746017456, + "num_tokens": 498982761.0, + "step": 13083 + }, + { + "epoch": 1.6644192850782344, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.198040008544922, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8506543636322021, + "num_tokens": 499025762.0, + "step": 13084 + }, + { + "epoch": 1.664546495356825, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.06330108642578, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8595826625823975, + "num_tokens": 499063394.0, + "step": 13085 + }, + { + "epoch": 1.6646737056354155, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.114816665649414, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.878050684928894, + "num_tokens": 499105824.0, + "step": 13086 + }, + { + "epoch": 1.6648009159140058, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08418083190918, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8768125772476196, + "num_tokens": 499140935.0, + "step": 13087 + }, + { + "epoch": 1.6649281261925963, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.907712936401367, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8669885993003845, + "num_tokens": 499179492.0, + "step": 13088 + }, + { + "epoch": 1.6650553364711869, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0595645904541, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8785533905029297, + "num_tokens": 499214716.0, + "step": 13089 + }, + { + "epoch": 1.6651825467497774, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.997154235839844, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8749195337295532, + "num_tokens": 499250337.0, + "step": 13090 + }, + { + "epoch": 1.665309757028368, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.06813621520996, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8733229041099548, + "num_tokens": 499283675.0, + "step": 13091 + }, + { + "epoch": 1.6654369673069584, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34835433959961, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8730844855308533, + "num_tokens": 499319153.0, + "step": 13092 + }, + { + "epoch": 1.6655641775855488, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.90123176574707, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8660352230072021, + "num_tokens": 499356675.0, + "step": 13093 + }, + { + "epoch": 1.6656913878641393, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.132600784301758, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.882664144039154, + "num_tokens": 499392129.0, + "step": 13094 + }, + { + "epoch": 1.6658185981427298, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.869285583496094, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8792648315429688, + "num_tokens": 499424958.0, + "step": 13095 + }, + { + "epoch": 1.6659458084213203, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00493812561035, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8773422837257385, + "num_tokens": 499467769.0, + "step": 13096 + }, + { + "epoch": 1.6660730186999109, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.016321182250977, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.858942985534668, + "num_tokens": 499513828.0, + "step": 13097 + }, + { + "epoch": 1.6662002289785014, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.037763595581055, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8743590712547302, + "num_tokens": 499548219.0, + "step": 13098 + }, + { + "epoch": 1.666327439257092, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13216209411621, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8516220450401306, + "num_tokens": 499587158.0, + "step": 13099 + }, + { + "epoch": 1.6664546495356825, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.007848739624023, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8709925413131714, + "num_tokens": 499625235.0, + "step": 13100 + }, + { + "epoch": 1.666581859814273, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.195274353027344, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8492677807807922, + "num_tokens": 499662842.0, + "step": 13101 + }, + { + "epoch": 1.6667090700928635, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.821720123291016, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8452406525611877, + "num_tokens": 499703464.0, + "step": 13102 + }, + { + "epoch": 1.666836280371454, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.909482955932617, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8677437901496887, + "num_tokens": 499739160.0, + "step": 13103 + }, + { + "epoch": 1.6669634906500446, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.982006072998047, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8544694185256958, + "num_tokens": 499775024.0, + "step": 13104 + }, + { + "epoch": 1.667090700928635, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.001461029052734, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8716431856155396, + "num_tokens": 499816479.0, + "step": 13105 + }, + { + "epoch": 1.6672179112072256, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.081302642822266, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.869701623916626, + "num_tokens": 499848790.0, + "step": 13106 + }, + { + "epoch": 1.6673451214858162, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.025278091430664, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8873937129974365, + "num_tokens": 499890955.0, + "step": 13107 + }, + { + "epoch": 1.6674723317644067, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.006563186645508, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8776886463165283, + "num_tokens": 499924813.0, + "step": 13108 + }, + { + "epoch": 1.6675995420429972, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.957361221313477, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8652937412261963, + "num_tokens": 499960032.0, + "step": 13109 + }, + { + "epoch": 1.6677267523215877, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.965412139892578, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8822823762893677, + "num_tokens": 499995868.0, + "step": 13110 + }, + { + "epoch": 1.667853962600178, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.91683006286621, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8728601336479187, + "num_tokens": 500031662.0, + "step": 13111 + }, + { + "epoch": 1.6679811728787686, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.995738983154297, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8589960336685181, + "num_tokens": 500070189.0, + "step": 13112 + }, + { + "epoch": 1.668108383157359, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.064157485961914, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8659523129463196, + "num_tokens": 500109999.0, + "step": 13113 + }, + { + "epoch": 1.6682355934359496, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.998645782470703, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8519988656044006, + "num_tokens": 500151473.0, + "step": 13114 + }, + { + "epoch": 1.6683628037145402, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0213623046875, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8662096858024597, + "num_tokens": 500187436.0, + "step": 13115 + }, + { + "epoch": 1.6684900139931307, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.111509323120117, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8686910271644592, + "num_tokens": 500222433.0, + "step": 13116 + }, + { + "epoch": 1.668617224271721, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15452766418457, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8683130741119385, + "num_tokens": 500257390.0, + "step": 13117 + }, + { + "epoch": 1.6687444345503115, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.090150833129883, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8676799535751343, + "num_tokens": 500301260.0, + "step": 13118 + }, + { + "epoch": 1.668871644828902, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.104093551635742, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.873430073261261, + "num_tokens": 500343453.0, + "step": 13119 + }, + { + "epoch": 1.6689988551074926, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.234548568725586, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8743858337402344, + "num_tokens": 500375133.0, + "step": 13120 + }, + { + "epoch": 1.669126065386083, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.058948516845703, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8639680743217468, + "num_tokens": 500408540.0, + "step": 13121 + }, + { + "epoch": 1.6692532756646736, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.24921989440918, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8816080689430237, + "num_tokens": 500446243.0, + "step": 13122 + }, + { + "epoch": 1.6693804859432642, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.80171012878418, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8431617617607117, + "num_tokens": 500484313.0, + "step": 13123 + }, + { + "epoch": 1.6695076962218547, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.295358657836914, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.880422055721283, + "num_tokens": 500525566.0, + "step": 13124 + }, + { + "epoch": 1.6696349065004452, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.968461990356445, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8731428384780884, + "num_tokens": 500566274.0, + "step": 13125 + }, + { + "epoch": 1.6697621167790357, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.070045471191406, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8617358207702637, + "num_tokens": 500602769.0, + "step": 13126 + }, + { + "epoch": 1.6698893270576263, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.158674240112305, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8521893620491028, + "num_tokens": 500642895.0, + "step": 13127 + }, + { + "epoch": 1.6700165373362168, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.075641632080078, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8672540783882141, + "num_tokens": 500681461.0, + "step": 13128 + }, + { + "epoch": 1.6701437476148073, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.75173568725586, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8513975143432617, + "num_tokens": 500723395.0, + "step": 13129 + }, + { + "epoch": 1.6702709578933979, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11383819580078, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8697713613510132, + "num_tokens": 500760571.0, + "step": 13130 + }, + { + "epoch": 1.6703981681719884, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.05307388305664, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8591536283493042, + "num_tokens": 500797858.0, + "step": 13131 + }, + { + "epoch": 1.670525378450579, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.415552139282227, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8744562268257141, + "num_tokens": 500840199.0, + "step": 13132 + }, + { + "epoch": 1.6706525887291694, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.005014419555664, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.86191725730896, + "num_tokens": 500883290.0, + "step": 13133 + }, + { + "epoch": 1.67077979900776, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13799476623535, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8720303773880005, + "num_tokens": 500923072.0, + "step": 13134 + }, + { + "epoch": 1.6709070092863505, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.229562759399414, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8722789883613586, + "num_tokens": 500958726.0, + "step": 13135 + }, + { + "epoch": 1.6710342195649408, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12712287902832, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.865420401096344, + "num_tokens": 500996746.0, + "step": 13136 + }, + { + "epoch": 1.6711614298435313, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61536407470703, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8611498475074768, + "num_tokens": 501034135.0, + "step": 13137 + }, + { + "epoch": 1.6712886401221219, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.04243278503418, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8704965114593506, + "num_tokens": 501069810.0, + "step": 13138 + }, + { + "epoch": 1.6714158504007124, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.110332489013672, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8624794483184814, + "num_tokens": 501110597.0, + "step": 13139 + }, + { + "epoch": 1.671543060679303, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.037364959716797, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8578989505767822, + "num_tokens": 501145927.0, + "step": 13140 + }, + { + "epoch": 1.6716702709578934, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.9234561920166, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8496406078338623, + "num_tokens": 501188473.0, + "step": 13141 + }, + { + "epoch": 1.6717974812364838, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.186662673950195, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.880108118057251, + "num_tokens": 501226143.0, + "step": 13142 + }, + { + "epoch": 1.6719246915150743, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.003877639770508, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8591820001602173, + "num_tokens": 501266980.0, + "step": 13143 + }, + { + "epoch": 1.6720519017936648, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.133440017700195, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.864614725112915, + "num_tokens": 501305414.0, + "step": 13144 + }, + { + "epoch": 1.6721791120722553, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.208757400512695, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8606290817260742, + "num_tokens": 501343009.0, + "step": 13145 + }, + { + "epoch": 1.6723063223508459, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18437957763672, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8744446635246277, + "num_tokens": 501383571.0, + "step": 13146 + }, + { + "epoch": 1.6724335326294364, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.22098159790039, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8689581155776978, + "num_tokens": 501425444.0, + "step": 13147 + }, + { + "epoch": 1.672560742908027, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28969955444336, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8810349702835083, + "num_tokens": 501466536.0, + "step": 13148 + }, + { + "epoch": 1.6726879531866174, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.237491607666016, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8710423707962036, + "num_tokens": 501507346.0, + "step": 13149 + }, + { + "epoch": 1.672815163465208, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.105661392211914, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8748976588249207, + "num_tokens": 501545515.0, + "step": 13150 + }, + { + "epoch": 1.6729423737437985, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13018798828125, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8624945878982544, + "num_tokens": 501588793.0, + "step": 13151 + }, + { + "epoch": 1.673069584022389, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30575942993164, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8656378984451294, + "num_tokens": 501624384.0, + "step": 13152 + }, + { + "epoch": 1.6731967943009796, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29361343383789, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8677229881286621, + "num_tokens": 501663150.0, + "step": 13153 + }, + { + "epoch": 1.67332400457957, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.972814559936523, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8616255521774292, + "num_tokens": 501706177.0, + "step": 13154 + }, + { + "epoch": 1.6734512148581606, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.353239059448242, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8799112439155579, + "num_tokens": 501745267.0, + "step": 13155 + }, + { + "epoch": 1.6735784251367511, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01805305480957, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8640343546867371, + "num_tokens": 501783591.0, + "step": 13156 + }, + { + "epoch": 1.6737056354153417, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.431528091430664, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8587076663970947, + "num_tokens": 501825190.0, + "step": 13157 + }, + { + "epoch": 1.6738328456939322, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.084115982055664, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8684989213943481, + "num_tokens": 501855813.0, + "step": 13158 + }, + { + "epoch": 1.6739600559725227, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.843320846557617, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8644765615463257, + "num_tokens": 501890106.0, + "step": 13159 + }, + { + "epoch": 1.674087266251113, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.387771606445312, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8547871708869934, + "num_tokens": 501935437.0, + "step": 13160 + }, + { + "epoch": 1.6742144765297036, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.267118453979492, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8555282950401306, + "num_tokens": 501980134.0, + "step": 13161 + }, + { + "epoch": 1.674341686808294, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08909034729004, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8617231845855713, + "num_tokens": 502015977.0, + "step": 13162 + }, + { + "epoch": 1.6744688970868846, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.198326110839844, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8703875541687012, + "num_tokens": 502061717.0, + "step": 13163 + }, + { + "epoch": 1.6745961073654752, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.19286346435547, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8656212091445923, + "num_tokens": 502101594.0, + "step": 13164 + }, + { + "epoch": 1.6747233176440657, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08268928527832, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8674571514129639, + "num_tokens": 502136521.0, + "step": 13165 + }, + { + "epoch": 1.674850527922656, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.673465728759766, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8483917713165283, + "num_tokens": 502174821.0, + "step": 13166 + }, + { + "epoch": 1.6749777382012465, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35387420654297, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8626536726951599, + "num_tokens": 502210615.0, + "step": 13167 + }, + { + "epoch": 1.675104948479837, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.02898597717285, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8672829866409302, + "num_tokens": 502248220.0, + "step": 13168 + }, + { + "epoch": 1.6752321587584276, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.591753005981445, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8660547733306885, + "num_tokens": 502290686.0, + "step": 13169 + }, + { + "epoch": 1.675359369037018, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41316795349121, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8561970591545105, + "num_tokens": 502331789.0, + "step": 13170 + }, + { + "epoch": 1.6754865793156086, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.233823776245117, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.87420254945755, + "num_tokens": 502371490.0, + "step": 13171 + }, + { + "epoch": 1.6756137895941992, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.19597625732422, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8670612573623657, + "num_tokens": 502415606.0, + "step": 13172 + }, + { + "epoch": 1.6757409998727897, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.922941207885742, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8661929368972778, + "num_tokens": 502456203.0, + "step": 13173 + }, + { + "epoch": 1.6758682101513802, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.477230072021484, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8836017847061157, + "num_tokens": 502491508.0, + "step": 13174 + }, + { + "epoch": 1.6759954204299707, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.335554122924805, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.856383204460144, + "num_tokens": 502533408.0, + "step": 13175 + }, + { + "epoch": 1.6761226307085613, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07114028930664, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8611582517623901, + "num_tokens": 502569824.0, + "step": 13176 + }, + { + "epoch": 1.6762498409871518, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.353620529174805, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8754916191101074, + "num_tokens": 502604776.0, + "step": 13177 + }, + { + "epoch": 1.6763770512657423, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.150211334228516, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8505529165267944, + "num_tokens": 502645732.0, + "step": 13178 + }, + { + "epoch": 1.6765042615443329, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.077882766723633, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8705272078514099, + "num_tokens": 502686044.0, + "step": 13179 + }, + { + "epoch": 1.6766314718229234, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30207061767578, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.857974648475647, + "num_tokens": 502728288.0, + "step": 13180 + }, + { + "epoch": 1.676758682101514, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.045011520385742, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8596315383911133, + "num_tokens": 502760772.0, + "step": 13181 + }, + { + "epoch": 1.6768858923801044, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.87984848022461, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8668277263641357, + "num_tokens": 502791585.0, + "step": 13182 + }, + { + "epoch": 1.677013102658695, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.747661590576172, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8843951225280762, + "num_tokens": 502826339.0, + "step": 13183 + }, + { + "epoch": 1.6771403129372855, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31175994873047, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8839707374572754, + "num_tokens": 502868716.0, + "step": 13184 + }, + { + "epoch": 1.6772675232158758, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.068593978881836, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8669157028198242, + "num_tokens": 502907992.0, + "step": 13185 + }, + { + "epoch": 1.6773947334944663, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.337989807128906, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8620167970657349, + "num_tokens": 502947197.0, + "step": 13186 + }, + { + "epoch": 1.6775219437730569, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.09604263305664, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.875808596611023, + "num_tokens": 502985890.0, + "step": 13187 + }, + { + "epoch": 1.6776491540516474, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.97357749938965, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8621447086334229, + "num_tokens": 503033476.0, + "step": 13188 + }, + { + "epoch": 1.677776364330238, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.228145599365234, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8753061294555664, + "num_tokens": 503068002.0, + "step": 13189 + }, + { + "epoch": 1.6779035746088284, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.292354583740234, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8595931529998779, + "num_tokens": 503110059.0, + "step": 13190 + }, + { + "epoch": 1.6780307848874187, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.154216766357422, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8612244725227356, + "num_tokens": 503146617.0, + "step": 13191 + }, + { + "epoch": 1.6781579951660093, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.091724395751953, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8641663789749146, + "num_tokens": 503186436.0, + "step": 13192 + }, + { + "epoch": 1.6782852054445998, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33342933654785, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8733259439468384, + "num_tokens": 503227983.0, + "step": 13193 + }, + { + "epoch": 1.6784124157231903, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.212051391601562, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.853929877281189, + "num_tokens": 503269301.0, + "step": 13194 + }, + { + "epoch": 1.6785396260017809, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34733772277832, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8497430086135864, + "num_tokens": 503308919.0, + "step": 13195 + }, + { + "epoch": 1.6786668362803714, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.121084213256836, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8770567774772644, + "num_tokens": 503345030.0, + "step": 13196 + }, + { + "epoch": 1.678794046558962, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.173707962036133, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.859241247177124, + "num_tokens": 503379142.0, + "step": 13197 + }, + { + "epoch": 1.6789212568375524, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.191396713256836, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8681161403656006, + "num_tokens": 503411810.0, + "step": 13198 + }, + { + "epoch": 1.679048467116143, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07914924621582, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8642368316650391, + "num_tokens": 503450266.0, + "step": 13199 + }, + { + "epoch": 1.6791756773947335, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.250213623046875, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8750661015510559, + "num_tokens": 503491015.0, + "step": 13200 + }, + { + "epoch": 1.679302887673324, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.127431869506836, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8749439716339111, + "num_tokens": 503531964.0, + "step": 13201 + }, + { + "epoch": 1.6794300979519146, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.072309494018555, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8715132474899292, + "num_tokens": 503576069.0, + "step": 13202 + }, + { + "epoch": 1.679557308230505, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.294050216674805, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8686455488204956, + "num_tokens": 503607895.0, + "step": 13203 + }, + { + "epoch": 1.6796845185090956, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12685775756836, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8625025749206543, + "num_tokens": 503636891.0, + "step": 13204 + }, + { + "epoch": 1.6798117287876861, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37824058532715, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8815984725952148, + "num_tokens": 503682037.0, + "step": 13205 + }, + { + "epoch": 1.6799389390662767, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.89853286743164, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8695135116577148, + "num_tokens": 503724008.0, + "step": 13206 + }, + { + "epoch": 1.6800661493448672, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.365020751953125, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8749485611915588, + "num_tokens": 503759008.0, + "step": 13207 + }, + { + "epoch": 1.6801933596234577, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.05917739868164, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8634395003318787, + "num_tokens": 503796599.0, + "step": 13208 + }, + { + "epoch": 1.680320569902048, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.172725677490234, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8649532794952393, + "num_tokens": 503835539.0, + "step": 13209 + }, + { + "epoch": 1.6804477801806386, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.998750686645508, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8640780448913574, + "num_tokens": 503875933.0, + "step": 13210 + }, + { + "epoch": 1.680574990459229, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.109485626220703, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8645355701446533, + "num_tokens": 503913740.0, + "step": 13211 + }, + { + "epoch": 1.6807022007378196, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.408964157104492, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8706141710281372, + "num_tokens": 503955105.0, + "step": 13212 + }, + { + "epoch": 1.6808294110164101, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.012535095214844, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8665857911109924, + "num_tokens": 503987641.0, + "step": 13213 + }, + { + "epoch": 1.6809566212950007, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.305282592773438, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8582653999328613, + "num_tokens": 504028699.0, + "step": 13214 + }, + { + "epoch": 1.681083831573591, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.240741729736328, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8611912727355957, + "num_tokens": 504065597.0, + "step": 13215 + }, + { + "epoch": 1.6812110418521815, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31863021850586, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8535028100013733, + "num_tokens": 504105968.0, + "step": 13216 + }, + { + "epoch": 1.681338252130772, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.072189331054688, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8692404627799988, + "num_tokens": 504143743.0, + "step": 13217 + }, + { + "epoch": 1.6814654624093626, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.270296096801758, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8692588806152344, + "num_tokens": 504177023.0, + "step": 13218 + }, + { + "epoch": 1.681592672687953, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.353845596313477, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8646253347396851, + "num_tokens": 504210492.0, + "step": 13219 + }, + { + "epoch": 1.6817198829665436, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.95940589904785, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8675283193588257, + "num_tokens": 504248322.0, + "step": 13220 + }, + { + "epoch": 1.6818470932451342, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.406579971313477, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8537859916687012, + "num_tokens": 504285915.0, + "step": 13221 + }, + { + "epoch": 1.6819743035237247, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.036588668823242, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8655574321746826, + "num_tokens": 504325578.0, + "step": 13222 + }, + { + "epoch": 1.6821015138023152, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.301462173461914, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8867599964141846, + "num_tokens": 504362876.0, + "step": 13223 + }, + { + "epoch": 1.6822287240809057, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.334264755249023, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8658121824264526, + "num_tokens": 504402303.0, + "step": 13224 + }, + { + "epoch": 1.6823559343594963, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.173274993896484, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8685548305511475, + "num_tokens": 504440039.0, + "step": 13225 + }, + { + "epoch": 1.6824831446380868, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13946533203125, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8583639860153198, + "num_tokens": 504485102.0, + "step": 13226 + }, + { + "epoch": 1.6826103549166773, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.029979705810547, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8666751980781555, + "num_tokens": 504522762.0, + "step": 13227 + }, + { + "epoch": 1.6827375651952678, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14716148376465, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8716744184494019, + "num_tokens": 504562818.0, + "step": 13228 + }, + { + "epoch": 1.6828647754738584, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.196041107177734, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8577594757080078, + "num_tokens": 504602914.0, + "step": 13229 + }, + { + "epoch": 1.682991985752449, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.972341537475586, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8746001124382019, + "num_tokens": 504641376.0, + "step": 13230 + }, + { + "epoch": 1.6831191960310394, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.138370513916016, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.873189389705658, + "num_tokens": 504675031.0, + "step": 13231 + }, + { + "epoch": 1.68324640630963, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.117359161376953, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8743153810501099, + "num_tokens": 504709624.0, + "step": 13232 + }, + { + "epoch": 1.6833736165882205, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.273265838623047, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.870895266532898, + "num_tokens": 504749310.0, + "step": 13233 + }, + { + "epoch": 1.6835008268668108, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.98722267150879, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8681426048278809, + "num_tokens": 504791700.0, + "step": 13234 + }, + { + "epoch": 1.6836280371454013, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.21462059020996, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8568605184555054, + "num_tokens": 504833246.0, + "step": 13235 + }, + { + "epoch": 1.6837552474239919, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.083236694335938, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8723330497741699, + "num_tokens": 504872901.0, + "step": 13236 + }, + { + "epoch": 1.6838824577025824, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.311946868896484, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8626520037651062, + "num_tokens": 504906138.0, + "step": 13237 + }, + { + "epoch": 1.684009667981173, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.134735107421875, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8473721146583557, + "num_tokens": 504947147.0, + "step": 13238 + }, + { + "epoch": 1.6841368782597634, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.21076774597168, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8689067363739014, + "num_tokens": 504985332.0, + "step": 13239 + }, + { + "epoch": 1.6842640885383537, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.081052780151367, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.859228253364563, + "num_tokens": 505027995.0, + "step": 13240 + }, + { + "epoch": 1.6843912988169443, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14618492126465, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8699129819869995, + "num_tokens": 505065311.0, + "step": 13241 + }, + { + "epoch": 1.6845185090955348, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.428997039794922, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8624613881111145, + "num_tokens": 505105810.0, + "step": 13242 + }, + { + "epoch": 1.6846457193741253, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.86623191833496, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8737838268280029, + "num_tokens": 505141003.0, + "step": 13243 + }, + { + "epoch": 1.6847729296527159, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.16640281677246, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.856464147567749, + "num_tokens": 505188025.0, + "step": 13244 + }, + { + "epoch": 1.6849001399313064, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96839141845703, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8812349438667297, + "num_tokens": 505226101.0, + "step": 13245 + }, + { + "epoch": 1.685027350209897, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.249706268310547, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8576338291168213, + "num_tokens": 505264880.0, + "step": 13246 + }, + { + "epoch": 1.6851545604884874, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.989704132080078, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8614352941513062, + "num_tokens": 505306208.0, + "step": 13247 + }, + { + "epoch": 1.685281770767078, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.254741668701172, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8878456354141235, + "num_tokens": 505346024.0, + "step": 13248 + }, + { + "epoch": 1.6854089810456685, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.230913162231445, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8894244432449341, + "num_tokens": 505383644.0, + "step": 13249 + }, + { + "epoch": 1.685536191324259, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.099628448486328, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8708084225654602, + "num_tokens": 505420365.0, + "step": 13250 + }, + { + "epoch": 1.6856634016028496, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26150131225586, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8687175512313843, + "num_tokens": 505458211.0, + "step": 13251 + }, + { + "epoch": 1.68579061188144, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.001150131225586, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8620849847793579, + "num_tokens": 505501649.0, + "step": 13252 + }, + { + "epoch": 1.6859178221600306, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.248611450195312, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8696861267089844, + "num_tokens": 505538763.0, + "step": 13253 + }, + { + "epoch": 1.6860450324386211, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.053794860839844, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8897597789764404, + "num_tokens": 505574562.0, + "step": 13254 + }, + { + "epoch": 1.6861722427172117, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.989946365356445, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.88139408826828, + "num_tokens": 505609718.0, + "step": 13255 + }, + { + "epoch": 1.6862994529958022, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.217477798461914, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8807233572006226, + "num_tokens": 505644337.0, + "step": 13256 + }, + { + "epoch": 1.6864266632743927, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.102563858032227, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8782070875167847, + "num_tokens": 505681929.0, + "step": 13257 + }, + { + "epoch": 1.686553873552983, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.134077072143555, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8448226451873779, + "num_tokens": 505719373.0, + "step": 13258 + }, + { + "epoch": 1.6866810838315736, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.206344604492188, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8534491062164307, + "num_tokens": 505757767.0, + "step": 13259 + }, + { + "epoch": 1.686808294110164, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48162841796875, + "learning_rate": 1e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.837494969367981, + "num_tokens": 505794777.0, + "step": 13260 + }, + { + "epoch": 1.6869355043887546, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08379554748535, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8738840818405151, + "num_tokens": 505832672.0, + "step": 13261 + }, + { + "epoch": 1.6870627146673451, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.237842559814453, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8558424711227417, + "num_tokens": 505873142.0, + "step": 13262 + }, + { + "epoch": 1.6871899249459357, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.992843627929688, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8707224726676941, + "num_tokens": 505911994.0, + "step": 13263 + }, + { + "epoch": 1.687317135224526, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07306480407715, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8713968992233276, + "num_tokens": 505948874.0, + "step": 13264 + }, + { + "epoch": 1.6874443455031165, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.975791931152344, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8734217882156372, + "num_tokens": 505989679.0, + "step": 13265 + }, + { + "epoch": 1.687571555781707, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.458215713500977, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8755344748497009, + "num_tokens": 506033209.0, + "step": 13266 + }, + { + "epoch": 1.6876987660602976, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.09136390686035, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8771034479141235, + "num_tokens": 506065379.0, + "step": 13267 + }, + { + "epoch": 1.687825976338888, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.100027084350586, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8594173192977905, + "num_tokens": 506099484.0, + "step": 13268 + }, + { + "epoch": 1.6879531866174786, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17037582397461, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8521334528923035, + "num_tokens": 506136559.0, + "step": 13269 + }, + { + "epoch": 1.6880803968960691, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.06995391845703, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8665807247161865, + "num_tokens": 506172997.0, + "step": 13270 + }, + { + "epoch": 1.6882076071746597, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.135887145996094, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8819119930267334, + "num_tokens": 506209858.0, + "step": 13271 + }, + { + "epoch": 1.6883348174532502, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.128459930419922, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8794776201248169, + "num_tokens": 506242260.0, + "step": 13272 + }, + { + "epoch": 1.6884620277318407, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.161529541015625, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8547277450561523, + "num_tokens": 506276856.0, + "step": 13273 + }, + { + "epoch": 1.6885892380104313, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.228906631469727, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.886731743812561, + "num_tokens": 506311469.0, + "step": 13274 + }, + { + "epoch": 1.6887164482890218, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29096031188965, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8799453377723694, + "num_tokens": 506348240.0, + "step": 13275 + }, + { + "epoch": 1.6888436585676123, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.168813705444336, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8693698644638062, + "num_tokens": 506387203.0, + "step": 13276 + }, + { + "epoch": 1.6889708688462028, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99655532836914, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8461651802062988, + "num_tokens": 506421869.0, + "step": 13277 + }, + { + "epoch": 1.6890980791247934, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.20020866394043, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8681337237358093, + "num_tokens": 506458873.0, + "step": 13278 + }, + { + "epoch": 1.689225289403384, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.158472061157227, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.854728102684021, + "num_tokens": 506495947.0, + "step": 13279 + }, + { + "epoch": 1.6893524996819744, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.027828216552734, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8496577739715576, + "num_tokens": 506533098.0, + "step": 13280 + }, + { + "epoch": 1.689479709960565, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96649932861328, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8788136839866638, + "num_tokens": 506571346.0, + "step": 13281 + }, + { + "epoch": 1.6896069202391555, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.504405975341797, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8689451217651367, + "num_tokens": 506607912.0, + "step": 13282 + }, + { + "epoch": 1.6897341305177458, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00303840637207, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8573562502861023, + "num_tokens": 506654258.0, + "step": 13283 + }, + { + "epoch": 1.6898613407963363, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.172977447509766, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8744066953659058, + "num_tokens": 506693436.0, + "step": 13284 + }, + { + "epoch": 1.6899885510749268, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.025819778442383, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8701287508010864, + "num_tokens": 506737383.0, + "step": 13285 + }, + { + "epoch": 1.6901157613535174, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.146509170532227, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8786242008209229, + "num_tokens": 506772282.0, + "step": 13286 + }, + { + "epoch": 1.690242971632108, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2370548248291, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8788940906524658, + "num_tokens": 506807208.0, + "step": 13287 + }, + { + "epoch": 1.6903701819106984, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07273292541504, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8739734292030334, + "num_tokens": 506847952.0, + "step": 13288 + }, + { + "epoch": 1.6904973921892887, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26079559326172, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8710984587669373, + "num_tokens": 506888265.0, + "step": 13289 + }, + { + "epoch": 1.6906246024678793, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.842065811157227, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8714027404785156, + "num_tokens": 506926096.0, + "step": 13290 + }, + { + "epoch": 1.6907518127464698, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31451416015625, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8615586757659912, + "num_tokens": 506962252.0, + "step": 13291 + }, + { + "epoch": 1.6908790230250603, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.23169708251953, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8737198114395142, + "num_tokens": 507004709.0, + "step": 13292 + }, + { + "epoch": 1.6910062333036509, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15013313293457, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8841546177864075, + "num_tokens": 507041041.0, + "step": 13293 + }, + { + "epoch": 1.6911334435822414, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.030447006225586, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8619787096977234, + "num_tokens": 507080251.0, + "step": 13294 + }, + { + "epoch": 1.691260653860832, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.236825942993164, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.866727352142334, + "num_tokens": 507120813.0, + "step": 13295 + }, + { + "epoch": 1.6913878641394224, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.040864944458008, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8553699851036072, + "num_tokens": 507160284.0, + "step": 13296 + }, + { + "epoch": 1.691515074418013, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2702693939209, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8612044453620911, + "num_tokens": 507205069.0, + "step": 13297 + }, + { + "epoch": 1.6916422846966035, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1091251373291, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8764813542366028, + "num_tokens": 507244689.0, + "step": 13298 + }, + { + "epoch": 1.691769494975194, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.123674392700195, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8700381517410278, + "num_tokens": 507283917.0, + "step": 13299 + }, + { + "epoch": 1.6918967052537845, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.99616241455078, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8704933524131775, + "num_tokens": 507327937.0, + "step": 13300 + }, + { + "epoch": 1.692023915532375, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.266944885253906, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8825560212135315, + "num_tokens": 507369521.0, + "step": 13301 + }, + { + "epoch": 1.6921511258109656, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.056640625, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8784685134887695, + "num_tokens": 507408157.0, + "step": 13302 + }, + { + "epoch": 1.6922783360895561, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.150665283203125, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8737954497337341, + "num_tokens": 507448003.0, + "step": 13303 + }, + { + "epoch": 1.6924055463681467, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.177141189575195, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8756327629089355, + "num_tokens": 507489970.0, + "step": 13304 + }, + { + "epoch": 1.6925327566467372, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.969818115234375, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8704476356506348, + "num_tokens": 507524536.0, + "step": 13305 + }, + { + "epoch": 1.6926599669253277, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.067638397216797, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8885507583618164, + "num_tokens": 507561575.0, + "step": 13306 + }, + { + "epoch": 1.692787177203918, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.155590057373047, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8710987567901611, + "num_tokens": 507597212.0, + "step": 13307 + }, + { + "epoch": 1.6929143874825086, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.16187858581543, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8703868389129639, + "num_tokens": 507634251.0, + "step": 13308 + }, + { + "epoch": 1.693041597761099, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.313810348510742, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8743206262588501, + "num_tokens": 507671345.0, + "step": 13309 + }, + { + "epoch": 1.6931688080396896, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.350486755371094, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8576928973197937, + "num_tokens": 507709797.0, + "step": 13310 + }, + { + "epoch": 1.6932960183182801, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.037403106689453, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8677335977554321, + "num_tokens": 507747047.0, + "step": 13311 + }, + { + "epoch": 1.6934232285968707, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07731819152832, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8695994019508362, + "num_tokens": 507782187.0, + "step": 13312 + }, + { + "epoch": 1.693550438875461, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.114274978637695, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.855350136756897, + "num_tokens": 507826714.0, + "step": 13313 + }, + { + "epoch": 1.6936776491540515, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17847442626953, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8608746528625488, + "num_tokens": 507869423.0, + "step": 13314 + }, + { + "epoch": 1.693804859432642, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.128429412841797, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8780060410499573, + "num_tokens": 507904711.0, + "step": 13315 + }, + { + "epoch": 1.6939320697112326, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.06547737121582, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8637272119522095, + "num_tokens": 507947760.0, + "step": 13316 + }, + { + "epoch": 1.694059279989823, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.220687866210938, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8655565977096558, + "num_tokens": 507984249.0, + "step": 13317 + }, + { + "epoch": 1.6941864902684136, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.086074829101562, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8653417229652405, + "num_tokens": 508017206.0, + "step": 13318 + }, + { + "epoch": 1.6943137005470041, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.19213104248047, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8636090159416199, + "num_tokens": 508056357.0, + "step": 13319 + }, + { + "epoch": 1.6944409108255947, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0347957611084, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8594492673873901, + "num_tokens": 508094870.0, + "step": 13320 + }, + { + "epoch": 1.6945681211041852, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.055517196655273, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8680003881454468, + "num_tokens": 508120323.0, + "step": 13321 + }, + { + "epoch": 1.6946953313827757, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.162445068359375, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8600218892097473, + "num_tokens": 508160018.0, + "step": 13322 + }, + { + "epoch": 1.6948225416613663, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.953264236450195, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8851103782653809, + "num_tokens": 508200947.0, + "step": 13323 + }, + { + "epoch": 1.6949497519399568, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35369110107422, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8552170991897583, + "num_tokens": 508246457.0, + "step": 13324 + }, + { + "epoch": 1.6950769622185473, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.085983276367188, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8636566400527954, + "num_tokens": 508281951.0, + "step": 13325 + }, + { + "epoch": 1.6952041724971378, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26106071472168, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8500058650970459, + "num_tokens": 508322571.0, + "step": 13326 + }, + { + "epoch": 1.6953313827757284, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17226219177246, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8749973177909851, + "num_tokens": 508357347.0, + "step": 13327 + }, + { + "epoch": 1.695458593054319, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.042190551757812, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8704322576522827, + "num_tokens": 508401813.0, + "step": 13328 + }, + { + "epoch": 1.6955858033329094, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.108070373535156, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8567636013031006, + "num_tokens": 508440457.0, + "step": 13329 + }, + { + "epoch": 1.6957130136115, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.148723602294922, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8592402338981628, + "num_tokens": 508472272.0, + "step": 13330 + }, + { + "epoch": 1.6958402238900905, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13973045349121, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8806024789810181, + "num_tokens": 508509352.0, + "step": 13331 + }, + { + "epoch": 1.6959674341686808, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.334678649902344, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8774404525756836, + "num_tokens": 508552515.0, + "step": 13332 + }, + { + "epoch": 1.6960946444472713, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.120664596557617, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8810800313949585, + "num_tokens": 508591399.0, + "step": 13333 + }, + { + "epoch": 1.6962218547258618, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.21723747253418, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8727255463600159, + "num_tokens": 508630427.0, + "step": 13334 + }, + { + "epoch": 1.6963490650044524, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13477325439453, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8753728866577148, + "num_tokens": 508667469.0, + "step": 13335 + }, + { + "epoch": 1.696476275283043, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.076311111450195, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8718534708023071, + "num_tokens": 508706308.0, + "step": 13336 + }, + { + "epoch": 1.6966034855616334, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.184398651123047, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8563915491104126, + "num_tokens": 508738968.0, + "step": 13337 + }, + { + "epoch": 1.6967306958402237, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1420955657959, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8740249872207642, + "num_tokens": 508769879.0, + "step": 13338 + }, + { + "epoch": 1.6968579061188143, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29531478881836, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8535725474357605, + "num_tokens": 508812277.0, + "step": 13339 + }, + { + "epoch": 1.6969851163974048, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.021102905273438, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8579720258712769, + "num_tokens": 508849293.0, + "step": 13340 + }, + { + "epoch": 1.6971123266759953, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18039894104004, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.869888424873352, + "num_tokens": 508885798.0, + "step": 13341 + }, + { + "epoch": 1.6972395369545858, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.19888687133789, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8663256168365479, + "num_tokens": 508920491.0, + "step": 13342 + }, + { + "epoch": 1.6973667472331764, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.025890350341797, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8687220811843872, + "num_tokens": 508956161.0, + "step": 13343 + }, + { + "epoch": 1.697493957511767, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.270645141601562, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8644297122955322, + "num_tokens": 508995696.0, + "step": 13344 + }, + { + "epoch": 1.6976211677903574, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.009185791015625, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8734485507011414, + "num_tokens": 509029505.0, + "step": 13345 + }, + { + "epoch": 1.697748378068948, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.054914474487305, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8760847449302673, + "num_tokens": 509062815.0, + "step": 13346 + }, + { + "epoch": 1.6978755883475385, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.143205642700195, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8740039467811584, + "num_tokens": 509103376.0, + "step": 13347 + }, + { + "epoch": 1.698002798626129, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.042530059814453, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8784075379371643, + "num_tokens": 509142797.0, + "step": 13348 + }, + { + "epoch": 1.6981300089047195, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.145530700683594, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8572734594345093, + "num_tokens": 509182724.0, + "step": 13349 + }, + { + "epoch": 1.69825721918331, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.09563636779785, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8772287964820862, + "num_tokens": 509217856.0, + "step": 13350 + }, + { + "epoch": 1.6983844294619006, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.231977462768555, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8691179156303406, + "num_tokens": 509251581.0, + "step": 13351 + }, + { + "epoch": 1.6985116397404911, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.276119232177734, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8846327066421509, + "num_tokens": 509286687.0, + "step": 13352 + }, + { + "epoch": 1.6986388500190817, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26618003845215, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8631037473678589, + "num_tokens": 509323372.0, + "step": 13353 + }, + { + "epoch": 1.6987660602976722, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.07925033569336, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.860419750213623, + "num_tokens": 509362472.0, + "step": 13354 + }, + { + "epoch": 1.6988932705762627, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.279571533203125, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8690299391746521, + "num_tokens": 509397676.0, + "step": 13355 + }, + { + "epoch": 1.699020480854853, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.242204666137695, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8726717233657837, + "num_tokens": 509441100.0, + "step": 13356 + }, + { + "epoch": 1.6991476911334435, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.123531341552734, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8643999099731445, + "num_tokens": 509485430.0, + "step": 13357 + }, + { + "epoch": 1.699274901412034, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.418994903564453, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8383577466011047, + "num_tokens": 509520102.0, + "step": 13358 + }, + { + "epoch": 1.6994021116906246, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.978818893432617, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8701331615447998, + "num_tokens": 509556525.0, + "step": 13359 + }, + { + "epoch": 1.6995293219692151, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1449031829834, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8616408109664917, + "num_tokens": 509596899.0, + "step": 13360 + }, + { + "epoch": 1.6996565322478057, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.172279357910156, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8685925006866455, + "num_tokens": 509632125.0, + "step": 13361 + }, + { + "epoch": 1.699783742526396, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.16622543334961, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8648182153701782, + "num_tokens": 509668131.0, + "step": 13362 + }, + { + "epoch": 1.6999109528049865, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.129701614379883, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.880801796913147, + "num_tokens": 509704078.0, + "step": 13363 + }, + { + "epoch": 1.700038163083577, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.055580139160156, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8813499212265015, + "num_tokens": 509739464.0, + "step": 13364 + }, + { + "epoch": 1.7001653733621676, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11546516418457, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8671050667762756, + "num_tokens": 509777970.0, + "step": 13365 + }, + { + "epoch": 1.700292583640758, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.410789489746094, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8542911410331726, + "num_tokens": 509817010.0, + "step": 13366 + }, + { + "epoch": 1.7004197939193486, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0745792388916, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8743665218353271, + "num_tokens": 509854192.0, + "step": 13367 + }, + { + "epoch": 1.7005470041979391, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.058401107788086, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8821103572845459, + "num_tokens": 509888633.0, + "step": 13368 + }, + { + "epoch": 1.7006742144765297, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.203174591064453, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8700183629989624, + "num_tokens": 509928474.0, + "step": 13369 + }, + { + "epoch": 1.7008014247551202, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.149105072021484, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.860764741897583, + "num_tokens": 509968875.0, + "step": 13370 + }, + { + "epoch": 1.7009286350337107, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.96730613708496, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8691420555114746, + "num_tokens": 510006021.0, + "step": 13371 + }, + { + "epoch": 1.7010558453123013, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.280725479125977, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8736859560012817, + "num_tokens": 510042111.0, + "step": 13372 + }, + { + "epoch": 1.7011830555908918, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30714225769043, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8751205205917358, + "num_tokens": 510074894.0, + "step": 13373 + }, + { + "epoch": 1.7013102658694823, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.154682159423828, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8597730398178101, + "num_tokens": 510111516.0, + "step": 13374 + }, + { + "epoch": 1.7014374761480728, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.189008712768555, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8716703653335571, + "num_tokens": 510151549.0, + "step": 13375 + }, + { + "epoch": 1.7015646864266634, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.112375259399414, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8686469197273254, + "num_tokens": 510189215.0, + "step": 13376 + }, + { + "epoch": 1.701691896705254, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.217937469482422, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8888351917266846, + "num_tokens": 510231189.0, + "step": 13377 + }, + { + "epoch": 1.7018191069838444, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.152698516845703, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8726129531860352, + "num_tokens": 510273956.0, + "step": 13378 + }, + { + "epoch": 1.701946317262435, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.153743743896484, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8847681283950806, + "num_tokens": 510315095.0, + "step": 13379 + }, + { + "epoch": 1.7020735275410255, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.168676376342773, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8480687141418457, + "num_tokens": 510350753.0, + "step": 13380 + }, + { + "epoch": 1.7022007378196158, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.345705032348633, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8740941286087036, + "num_tokens": 510389257.0, + "step": 13381 + }, + { + "epoch": 1.7023279480982063, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.146116256713867, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.869870662689209, + "num_tokens": 510428879.0, + "step": 13382 + }, + { + "epoch": 1.7024551583767968, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.241933822631836, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8721570372581482, + "num_tokens": 510468409.0, + "step": 13383 + }, + { + "epoch": 1.7025823686553874, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.274770736694336, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8843733072280884, + "num_tokens": 510503988.0, + "step": 13384 + }, + { + "epoch": 1.702709578933978, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.295007705688477, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8674178719520569, + "num_tokens": 510539009.0, + "step": 13385 + }, + { + "epoch": 1.7028367892125684, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.341747283935547, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8834291696548462, + "num_tokens": 510576177.0, + "step": 13386 + }, + { + "epoch": 1.7029639994911587, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36711311340332, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8685989379882812, + "num_tokens": 510622194.0, + "step": 13387 + }, + { + "epoch": 1.7030912097697493, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.050743103027344, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8856516480445862, + "num_tokens": 510663564.0, + "step": 13388 + }, + { + "epoch": 1.7032184200483398, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.240713119506836, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8798167705535889, + "num_tokens": 510702153.0, + "step": 13389 + }, + { + "epoch": 1.7033456303269303, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.055421829223633, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8648277521133423, + "num_tokens": 510742812.0, + "step": 13390 + }, + { + "epoch": 1.7034728406055208, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.203201293945312, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.863403856754303, + "num_tokens": 510783363.0, + "step": 13391 + }, + { + "epoch": 1.7036000508841114, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.059837341308594, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8546234965324402, + "num_tokens": 510818362.0, + "step": 13392 + }, + { + "epoch": 1.703727261162702, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.131637573242188, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.881218671798706, + "num_tokens": 510854732.0, + "step": 13393 + }, + { + "epoch": 1.7038544714412924, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.061037063598633, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8628074526786804, + "num_tokens": 510895107.0, + "step": 13394 + }, + { + "epoch": 1.703981681719883, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.171152114868164, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8657196760177612, + "num_tokens": 510930737.0, + "step": 13395 + }, + { + "epoch": 1.7041088919984735, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.073366165161133, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8765941262245178, + "num_tokens": 510964646.0, + "step": 13396 + }, + { + "epoch": 1.704236102277064, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.273874282836914, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8725613355636597, + "num_tokens": 511000583.0, + "step": 13397 + }, + { + "epoch": 1.7043633125556545, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.090415954589844, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8715463280677795, + "num_tokens": 511033729.0, + "step": 13398 + }, + { + "epoch": 1.704490522834245, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.088014602661133, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8741095662117004, + "num_tokens": 511074099.0, + "step": 13399 + }, + { + "epoch": 1.7046177331128356, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.313196182250977, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8560899496078491, + "num_tokens": 511112473.0, + "step": 13400 + }, + { + "epoch": 1.7047449433914261, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.120773315429688, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8694217205047607, + "num_tokens": 511149839.0, + "step": 13401 + }, + { + "epoch": 1.7048721536700167, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.118471145629883, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8585337996482849, + "num_tokens": 511183054.0, + "step": 13402 + }, + { + "epoch": 1.7049993639486072, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.289060592651367, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8729910850524902, + "num_tokens": 511218498.0, + "step": 13403 + }, + { + "epoch": 1.7051265742271977, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36678123474121, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8655656576156616, + "num_tokens": 511259776.0, + "step": 13404 + }, + { + "epoch": 1.705253784505788, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.05852699279785, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8766494989395142, + "num_tokens": 511297331.0, + "step": 13405 + }, + { + "epoch": 1.7053809947843785, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.146705627441406, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8804975748062134, + "num_tokens": 511331055.0, + "step": 13406 + }, + { + "epoch": 1.705508205062969, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.24863624572754, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8863874673843384, + "num_tokens": 511368651.0, + "step": 13407 + }, + { + "epoch": 1.7056354153415596, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.113967895507812, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8736637830734253, + "num_tokens": 511402163.0, + "step": 13408 + }, + { + "epoch": 1.7057626256201501, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.080867767333984, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8621257543563843, + "num_tokens": 511443769.0, + "step": 13409 + }, + { + "epoch": 1.7058898358987407, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.343399047851562, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8672824501991272, + "num_tokens": 511481038.0, + "step": 13410 + }, + { + "epoch": 1.706017046177331, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12481689453125, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8713641166687012, + "num_tokens": 511522940.0, + "step": 13411 + }, + { + "epoch": 1.7061442564559215, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.119455337524414, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8665693998336792, + "num_tokens": 511562630.0, + "step": 13412 + }, + { + "epoch": 1.706271466734512, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.447614669799805, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8544763326644897, + "num_tokens": 511598986.0, + "step": 13413 + }, + { + "epoch": 1.7063986770131025, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.312667846679688, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8718471527099609, + "num_tokens": 511633582.0, + "step": 13414 + }, + { + "epoch": 1.706525887291693, + "ewc_loss": 0.03369140625, + "ewc_loss_parallel": 3.361701965332031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.218582153320312, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8696191310882568, + "num_tokens": 511673296.0, + "step": 13415 + }, + { + "epoch": 1.7066530975702836, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.197315216064453, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8616001605987549, + "num_tokens": 511707890.0, + "step": 13416 + }, + { + "epoch": 1.7067803078488741, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.45630645751953, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.87486332654953, + "num_tokens": 511741597.0, + "step": 13417 + }, + { + "epoch": 1.7069075181274647, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.483715057373047, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8735318183898926, + "num_tokens": 511776816.0, + "step": 13418 + }, + { + "epoch": 1.7070347284060552, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36582374572754, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.862735390663147, + "num_tokens": 511814866.0, + "step": 13419 + }, + { + "epoch": 1.7071619386846457, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.086698532104492, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.870855450630188, + "num_tokens": 511852458.0, + "step": 13420 + }, + { + "epoch": 1.7072891489632362, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.421537399291992, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8642889261245728, + "num_tokens": 511891201.0, + "step": 13421 + }, + { + "epoch": 1.7074163592418268, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.181232452392578, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8692038059234619, + "num_tokens": 511930862.0, + "step": 13422 + }, + { + "epoch": 1.7075435695204173, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.553499221801758, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.876559853553772, + "num_tokens": 511967695.0, + "step": 13423 + }, + { + "epoch": 1.7076707797990078, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.968992233276367, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8830550909042358, + "num_tokens": 512005976.0, + "step": 13424 + }, + { + "epoch": 1.7077979900775984, + "ewc_loss": 0.033447265625, + "ewc_loss_parallel": 3.337860107421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.082368850708008, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8702009320259094, + "num_tokens": 512041575.0, + "step": 13425 + }, + { + "epoch": 1.7079252003561889, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.248167037963867, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8660058975219727, + "num_tokens": 512076171.0, + "step": 13426 + }, + { + "epoch": 1.7080524106347794, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.194578170776367, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8648542165756226, + "num_tokens": 512112082.0, + "step": 13427 + }, + { + "epoch": 1.70817962091337, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.21442413330078, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8560953140258789, + "num_tokens": 512149126.0, + "step": 13428 + }, + { + "epoch": 1.7083068311919605, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15314483642578, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8708525896072388, + "num_tokens": 512182368.0, + "step": 13429 + }, + { + "epoch": 1.7084340414705508, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.140491485595703, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8494908809661865, + "num_tokens": 512218224.0, + "step": 13430 + }, + { + "epoch": 1.7085612517491413, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14344596862793, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8687341809272766, + "num_tokens": 512255523.0, + "step": 13431 + }, + { + "epoch": 1.7086884620277318, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.223888397216797, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8786550760269165, + "num_tokens": 512290514.0, + "step": 13432 + }, + { + "epoch": 1.7088156723063224, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.193130493164062, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8787837624549866, + "num_tokens": 512328468.0, + "step": 13433 + }, + { + "epoch": 1.708942882584913, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18633270263672, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8691180348396301, + "num_tokens": 512359786.0, + "step": 13434 + }, + { + "epoch": 1.7090700928635034, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11119270324707, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8472499847412109, + "num_tokens": 512395679.0, + "step": 13435 + }, + { + "epoch": 1.7091973031420937, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.219133377075195, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8675620555877686, + "num_tokens": 512433491.0, + "step": 13436 + }, + { + "epoch": 1.7093245134206843, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36142921447754, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8634420037269592, + "num_tokens": 512470215.0, + "step": 13437 + }, + { + "epoch": 1.7094517236992748, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.981582641601562, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8638588786125183, + "num_tokens": 512507055.0, + "step": 13438 + }, + { + "epoch": 1.7095789339778653, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1376953125, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8506487011909485, + "num_tokens": 512547877.0, + "step": 13439 + }, + { + "epoch": 1.7097061442564558, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.103059768676758, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8546654582023621, + "num_tokens": 512586558.0, + "step": 13440 + }, + { + "epoch": 1.7098333545350464, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.06548309326172, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8657912015914917, + "num_tokens": 512631181.0, + "step": 13441 + }, + { + "epoch": 1.709960564813637, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.318601608276367, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8794265389442444, + "num_tokens": 512667925.0, + "step": 13442 + }, + { + "epoch": 1.7100877750922274, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.16689682006836, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8512578010559082, + "num_tokens": 512703136.0, + "step": 13443 + }, + { + "epoch": 1.710214985370818, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.945669174194336, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.871485710144043, + "num_tokens": 512739001.0, + "step": 13444 + }, + { + "epoch": 1.7103421956494085, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17790412902832, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8722611665725708, + "num_tokens": 512780517.0, + "step": 13445 + }, + { + "epoch": 1.710469405927999, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.108970642089844, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8750966787338257, + "num_tokens": 512815964.0, + "step": 13446 + }, + { + "epoch": 1.7105966162065895, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.002408981323242, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8719291090965271, + "num_tokens": 512855451.0, + "step": 13447 + }, + { + "epoch": 1.71072382648518, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.196975708007812, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8720346689224243, + "num_tokens": 512895809.0, + "step": 13448 + }, + { + "epoch": 1.7108510367637706, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00418472290039, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8738020658493042, + "num_tokens": 512936424.0, + "step": 13449 + }, + { + "epoch": 1.7109782470423611, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.306413650512695, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8829853534698486, + "num_tokens": 512976486.0, + "step": 13450 + }, + { + "epoch": 1.7111054573209517, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.05467414855957, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8604120016098022, + "num_tokens": 513014131.0, + "step": 13451 + }, + { + "epoch": 1.7112326675995422, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.836219787597656, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8410065770149231, + "num_tokens": 513054706.0, + "step": 13452 + }, + { + "epoch": 1.7113598778781327, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34998893737793, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8733177185058594, + "num_tokens": 513094089.0, + "step": 13453 + }, + { + "epoch": 1.711487088156723, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14691925048828, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8666760921478271, + "num_tokens": 513136306.0, + "step": 13454 + }, + { + "epoch": 1.7116142984353135, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.27174186706543, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8651928305625916, + "num_tokens": 513174376.0, + "step": 13455 + }, + { + "epoch": 1.711741508713904, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.153156280517578, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8723421096801758, + "num_tokens": 513211834.0, + "step": 13456 + }, + { + "epoch": 1.7118687189924946, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.095640182495117, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8604484796524048, + "num_tokens": 513241765.0, + "step": 13457 + }, + { + "epoch": 1.7119959292710851, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.195831298828125, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8563951253890991, + "num_tokens": 513283510.0, + "step": 13458 + }, + { + "epoch": 1.7121231395496757, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.297256469726562, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.865531325340271, + "num_tokens": 513320746.0, + "step": 13459 + }, + { + "epoch": 1.712250349828266, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.208133697509766, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.865937352180481, + "num_tokens": 513358036.0, + "step": 13460 + }, + { + "epoch": 1.7123775601068565, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.429141998291016, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.873878002166748, + "num_tokens": 513395156.0, + "step": 13461 + }, + { + "epoch": 1.712504770385447, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.099281311035156, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8624633550643921, + "num_tokens": 513438631.0, + "step": 13462 + }, + { + "epoch": 1.7126319806640375, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.254352569580078, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8594851493835449, + "num_tokens": 513471225.0, + "step": 13463 + }, + { + "epoch": 1.712759190942628, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.475584030151367, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8600831031799316, + "num_tokens": 513513824.0, + "step": 13464 + }, + { + "epoch": 1.7128864012212186, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.01078224182129, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8621191382408142, + "num_tokens": 513556263.0, + "step": 13465 + }, + { + "epoch": 1.7130136114998091, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.233566284179688, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8716276288032532, + "num_tokens": 513595093.0, + "step": 13466 + }, + { + "epoch": 1.7131408217783997, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.422534942626953, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8606123328208923, + "num_tokens": 513633761.0, + "step": 13467 + }, + { + "epoch": 1.7132680320569902, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.221567153930664, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8653479814529419, + "num_tokens": 513671404.0, + "step": 13468 + }, + { + "epoch": 1.7133952423355807, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.180519104003906, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8675118684768677, + "num_tokens": 513713752.0, + "step": 13469 + }, + { + "epoch": 1.7135224526141712, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.295475006103516, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8639230728149414, + "num_tokens": 513755261.0, + "step": 13470 + }, + { + "epoch": 1.7136496628927618, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.427576065063477, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.873841404914856, + "num_tokens": 513799042.0, + "step": 13471 + }, + { + "epoch": 1.7137768731713523, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.171537399291992, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8579123020172119, + "num_tokens": 513838148.0, + "step": 13472 + }, + { + "epoch": 1.7139040834499428, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.242961883544922, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8747372031211853, + "num_tokens": 513878960.0, + "step": 13473 + }, + { + "epoch": 1.7140312937285334, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.253681182861328, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8694095611572266, + "num_tokens": 513914083.0, + "step": 13474 + }, + { + "epoch": 1.7141585040071239, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.484601974487305, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8570911288261414, + "num_tokens": 513951591.0, + "step": 13475 + }, + { + "epoch": 1.7142857142857144, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.213542938232422, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8554458618164062, + "num_tokens": 513986127.0, + "step": 13476 + }, + { + "epoch": 1.714412924564305, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.118789672851562, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8630000352859497, + "num_tokens": 514028248.0, + "step": 13477 + }, + { + "epoch": 1.7145401348428955, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.741201400756836, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.871625542640686, + "num_tokens": 514062757.0, + "step": 13478 + }, + { + "epoch": 1.7146673451214858, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.747467041015625, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.85502028465271, + "num_tokens": 514102635.0, + "step": 13479 + }, + { + "epoch": 1.7147945554000763, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.026823043823242, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8666967749595642, + "num_tokens": 514140883.0, + "step": 13480 + }, + { + "epoch": 1.7149217656786668, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.443714141845703, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8763773441314697, + "num_tokens": 514176536.0, + "step": 13481 + }, + { + "epoch": 1.7150489759572574, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.381610870361328, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8621655702590942, + "num_tokens": 514209969.0, + "step": 13482 + }, + { + "epoch": 1.7151761862358479, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.065982818603516, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8675264120101929, + "num_tokens": 514249679.0, + "step": 13483 + }, + { + "epoch": 1.7153033965144384, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.676998138427734, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8621704578399658, + "num_tokens": 514292719.0, + "step": 13484 + }, + { + "epoch": 1.7154306067930287, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49823570251465, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8622636795043945, + "num_tokens": 514323396.0, + "step": 13485 + }, + { + "epoch": 1.7155578170716193, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.03706169128418, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8732213973999023, + "num_tokens": 514360951.0, + "step": 13486 + }, + { + "epoch": 1.7156850273502098, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.0851993560791, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8518741726875305, + "num_tokens": 514403879.0, + "step": 13487 + }, + { + "epoch": 1.7158122376288003, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50061798095703, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8754633665084839, + "num_tokens": 514446628.0, + "step": 13488 + }, + { + "epoch": 1.7159394479073908, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.075777053833008, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8690545558929443, + "num_tokens": 514486132.0, + "step": 13489 + }, + { + "epoch": 1.7160666581859814, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.075946807861328, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8572879433631897, + "num_tokens": 514526384.0, + "step": 13490 + }, + { + "epoch": 1.716193868464572, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.752607345581055, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8657969236373901, + "num_tokens": 514565828.0, + "step": 13491 + }, + { + "epoch": 1.7163210787431624, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.424575805664062, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8631337285041809, + "num_tokens": 514605260.0, + "step": 13492 + }, + { + "epoch": 1.716448289021753, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.285423278808594, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8690517544746399, + "num_tokens": 514643433.0, + "step": 13493 + }, + { + "epoch": 1.7165754993003435, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.113420486450195, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8699890375137329, + "num_tokens": 514683264.0, + "step": 13494 + }, + { + "epoch": 1.716702709578934, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41441535949707, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8750951290130615, + "num_tokens": 514719775.0, + "step": 13495 + }, + { + "epoch": 1.7168299198575245, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.98981475830078, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8611642122268677, + "num_tokens": 514766287.0, + "step": 13496 + }, + { + "epoch": 1.716957130136115, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.055185317993164, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8773250579833984, + "num_tokens": 514806725.0, + "step": 13497 + }, + { + "epoch": 1.7170843404147056, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.388141632080078, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8708691596984863, + "num_tokens": 514844008.0, + "step": 13498 + }, + { + "epoch": 1.7172115506932961, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58596420288086, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8562348484992981, + "num_tokens": 514880356.0, + "step": 13499 + }, + { + "epoch": 1.7173387609718866, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.228527069091797, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8793790340423584, + "num_tokens": 514915309.0, + "step": 13500 + }, + { + "epoch": 1.7174659712504772, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15392303466797, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8611840605735779, + "num_tokens": 514956496.0, + "step": 13501 + }, + { + "epoch": 1.7175931815290677, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.539819717407227, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8677024841308594, + "num_tokens": 514992247.0, + "step": 13502 + }, + { + "epoch": 1.717720391807658, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.329349517822266, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8592850565910339, + "num_tokens": 515035443.0, + "step": 13503 + }, + { + "epoch": 1.7178476020862485, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.355072021484375, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8717234134674072, + "num_tokens": 515072279.0, + "step": 13504 + }, + { + "epoch": 1.717974812364839, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.145099639892578, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8672467470169067, + "num_tokens": 515112547.0, + "step": 13505 + }, + { + "epoch": 1.7181020226434296, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08942985534668, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8563166856765747, + "num_tokens": 515146814.0, + "step": 13506 + }, + { + "epoch": 1.7182292329220201, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.075864791870117, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.869282603263855, + "num_tokens": 515184393.0, + "step": 13507 + }, + { + "epoch": 1.7183564432006107, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40740966796875, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8606035113334656, + "num_tokens": 515225312.0, + "step": 13508 + }, + { + "epoch": 1.718483653479201, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.259292602539062, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.845561146736145, + "num_tokens": 515261330.0, + "step": 13509 + }, + { + "epoch": 1.7186108637577915, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.058359146118164, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8733983635902405, + "num_tokens": 515299325.0, + "step": 13510 + }, + { + "epoch": 1.718738074036382, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80252456665039, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8684470653533936, + "num_tokens": 515335213.0, + "step": 13511 + }, + { + "epoch": 1.7188652843149725, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.844257354736328, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8791251182556152, + "num_tokens": 515365699.0, + "step": 13512 + }, + { + "epoch": 1.718992494593563, + "ewc_loss": 0.033935546875, + "ewc_loss_parallel": 3.3855438232421875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.47207260131836, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8766915202140808, + "num_tokens": 515396025.0, + "step": 13513 + }, + { + "epoch": 1.7191197048721536, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.241777420043945, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8676881790161133, + "num_tokens": 515429724.0, + "step": 13514 + }, + { + "epoch": 1.7192469151507441, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.095394134521484, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8637467622756958, + "num_tokens": 515465894.0, + "step": 13515 + }, + { + "epoch": 1.7193741254293347, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33098602294922, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8863139152526855, + "num_tokens": 515498894.0, + "step": 13516 + }, + { + "epoch": 1.7195013357079252, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11764144897461, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8670642971992493, + "num_tokens": 515538783.0, + "step": 13517 + }, + { + "epoch": 1.7196285459865157, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.221689224243164, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8665926456451416, + "num_tokens": 515573107.0, + "step": 13518 + }, + { + "epoch": 1.7197557562651062, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.320537567138672, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8692758083343506, + "num_tokens": 515614901.0, + "step": 13519 + }, + { + "epoch": 1.7198829665436968, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.322412490844727, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8703639507293701, + "num_tokens": 515654197.0, + "step": 13520 + }, + { + "epoch": 1.7200101768222873, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17115592956543, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8859105110168457, + "num_tokens": 515697003.0, + "step": 13521 + }, + { + "epoch": 1.7201373871008778, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.186599731445312, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8642799854278564, + "num_tokens": 515737801.0, + "step": 13522 + }, + { + "epoch": 1.7202645973794684, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.162416458129883, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8602322340011597, + "num_tokens": 515774711.0, + "step": 13523 + }, + { + "epoch": 1.7203918076580589, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.383708953857422, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8598473072052002, + "num_tokens": 515811728.0, + "step": 13524 + }, + { + "epoch": 1.7205190179366494, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.104841232299805, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.874529242515564, + "num_tokens": 515844425.0, + "step": 13525 + }, + { + "epoch": 1.72064622821524, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.134601593017578, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8701791763305664, + "num_tokens": 515882439.0, + "step": 13526 + }, + { + "epoch": 1.7207734384938305, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.00696563720703, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8460670113563538, + "num_tokens": 515922168.0, + "step": 13527 + }, + { + "epoch": 1.7209006487724208, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.04412269592285, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8748549222946167, + "num_tokens": 515961840.0, + "step": 13528 + }, + { + "epoch": 1.7210278590510113, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.398242950439453, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8680126667022705, + "num_tokens": 515997586.0, + "step": 13529 + }, + { + "epoch": 1.7211550693296018, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.167980194091797, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8737539052963257, + "num_tokens": 516036660.0, + "step": 13530 + }, + { + "epoch": 1.7212822796081924, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.204423904418945, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8609119653701782, + "num_tokens": 516070846.0, + "step": 13531 + }, + { + "epoch": 1.7214094898867829, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.193880081176758, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8890062570571899, + "num_tokens": 516107027.0, + "step": 13532 + }, + { + "epoch": 1.7215367001653734, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.126991271972656, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8685104846954346, + "num_tokens": 516143971.0, + "step": 13533 + }, + { + "epoch": 1.7216639104439637, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.441804885864258, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8704217672348022, + "num_tokens": 516180617.0, + "step": 13534 + }, + { + "epoch": 1.7217911207225542, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.205970764160156, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8685469627380371, + "num_tokens": 516220371.0, + "step": 13535 + }, + { + "epoch": 1.7219183310011448, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26265525817871, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8712131977081299, + "num_tokens": 516258642.0, + "step": 13536 + }, + { + "epoch": 1.7220455412797353, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.318674087524414, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8613479137420654, + "num_tokens": 516298971.0, + "step": 13537 + }, + { + "epoch": 1.7221727515583258, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.130760192871094, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8797649145126343, + "num_tokens": 516334457.0, + "step": 13538 + }, + { + "epoch": 1.7222999618369164, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.083372116088867, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8932782411575317, + "num_tokens": 516374585.0, + "step": 13539 + }, + { + "epoch": 1.7224271721155069, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1685791015625, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8532984852790833, + "num_tokens": 516414180.0, + "step": 13540 + }, + { + "epoch": 1.7225543823940974, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.179039001464844, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8776692152023315, + "num_tokens": 516450189.0, + "step": 13541 + }, + { + "epoch": 1.722681592672688, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.42915153503418, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8665595054626465, + "num_tokens": 516487549.0, + "step": 13542 + }, + { + "epoch": 1.7228088029512785, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.219633102416992, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8559205532073975, + "num_tokens": 516526649.0, + "step": 13543 + }, + { + "epoch": 1.722936013229869, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.234373092651367, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8724060654640198, + "num_tokens": 516563829.0, + "step": 13544 + }, + { + "epoch": 1.7230632235084595, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12428855895996, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8522538542747498, + "num_tokens": 516605179.0, + "step": 13545 + }, + { + "epoch": 1.72319043378705, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.24759292602539, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8662851452827454, + "num_tokens": 516641576.0, + "step": 13546 + }, + { + "epoch": 1.7233176440656406, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.22888946533203, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8658182621002197, + "num_tokens": 516678687.0, + "step": 13547 + }, + { + "epoch": 1.7234448543442311, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.196855545043945, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8639494776725769, + "num_tokens": 516711484.0, + "step": 13548 + }, + { + "epoch": 1.7235720646228216, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.289478302001953, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8644691109657288, + "num_tokens": 516750598.0, + "step": 13549 + }, + { + "epoch": 1.7236992749014122, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.247234344482422, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8743733167648315, + "num_tokens": 516793438.0, + "step": 13550 + }, + { + "epoch": 1.7238264851800027, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2099552154541, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8827793598175049, + "num_tokens": 516828692.0, + "step": 13551 + }, + { + "epoch": 1.723953695458593, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39163589477539, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.869360625743866, + "num_tokens": 516867592.0, + "step": 13552 + }, + { + "epoch": 1.7240809057371835, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.347246170043945, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8563164472579956, + "num_tokens": 516907668.0, + "step": 13553 + }, + { + "epoch": 1.724208116015774, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.170148849487305, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8666292428970337, + "num_tokens": 516944628.0, + "step": 13554 + }, + { + "epoch": 1.7243353262943646, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30535125732422, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8633781671524048, + "num_tokens": 516989027.0, + "step": 13555 + }, + { + "epoch": 1.7244625365729551, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.366653442382812, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8641613125801086, + "num_tokens": 517024464.0, + "step": 13556 + }, + { + "epoch": 1.7245897468515456, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.370807647705078, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8715357780456543, + "num_tokens": 517067714.0, + "step": 13557 + }, + { + "epoch": 1.724716957130136, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.312410354614258, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.871144711971283, + "num_tokens": 517107564.0, + "step": 13558 + }, + { + "epoch": 1.7248441674087265, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39789581298828, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8734918832778931, + "num_tokens": 517148240.0, + "step": 13559 + }, + { + "epoch": 1.724971377687317, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28717613220215, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8613821864128113, + "num_tokens": 517187841.0, + "step": 13560 + }, + { + "epoch": 1.7250985879659075, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11410903930664, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8523191213607788, + "num_tokens": 517225732.0, + "step": 13561 + }, + { + "epoch": 1.725225798244498, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.233413696289062, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8513832092285156, + "num_tokens": 517260423.0, + "step": 13562 + }, + { + "epoch": 1.7253530085230886, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.286832809448242, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8550742864608765, + "num_tokens": 517300862.0, + "step": 13563 + }, + { + "epoch": 1.7254802188016791, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.232858657836914, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8723645806312561, + "num_tokens": 517341010.0, + "step": 13564 + }, + { + "epoch": 1.7256074290802697, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1430721282959, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8518086671829224, + "num_tokens": 517376853.0, + "step": 13565 + }, + { + "epoch": 1.7257346393588602, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.259498596191406, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.862646222114563, + "num_tokens": 517421854.0, + "step": 13566 + }, + { + "epoch": 1.7258618496374507, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.222549438476562, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8622400164604187, + "num_tokens": 517461711.0, + "step": 13567 + }, + { + "epoch": 1.7259890599160412, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.506589889526367, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8744768500328064, + "num_tokens": 517494755.0, + "step": 13568 + }, + { + "epoch": 1.7261162701946318, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.210378646850586, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8585739135742188, + "num_tokens": 517539061.0, + "step": 13569 + }, + { + "epoch": 1.7262434804732223, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.379392623901367, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8633514046669006, + "num_tokens": 517579768.0, + "step": 13570 + }, + { + "epoch": 1.7263706907518128, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.043107986450195, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8789623379707336, + "num_tokens": 517612006.0, + "step": 13571 + }, + { + "epoch": 1.7264979010304033, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.473142623901367, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8676918745040894, + "num_tokens": 517647547.0, + "step": 13572 + }, + { + "epoch": 1.7266251113089939, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.305574417114258, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.857504665851593, + "num_tokens": 517689067.0, + "step": 13573 + }, + { + "epoch": 1.7267523215875844, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.426219940185547, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8730265498161316, + "num_tokens": 517729967.0, + "step": 13574 + }, + { + "epoch": 1.726879531866175, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.173391342163086, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.867250382900238, + "num_tokens": 517770427.0, + "step": 13575 + }, + { + "epoch": 1.7270067421447655, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.204015731811523, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.865517258644104, + "num_tokens": 517804202.0, + "step": 13576 + }, + { + "epoch": 1.7271339524233558, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2786865234375, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8590784668922424, + "num_tokens": 517834825.0, + "step": 13577 + }, + { + "epoch": 1.7272611627019463, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.258623123168945, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8595446348190308, + "num_tokens": 517874600.0, + "step": 13578 + }, + { + "epoch": 1.7273883729805368, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.276350021362305, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8586120009422302, + "num_tokens": 517913488.0, + "step": 13579 + }, + { + "epoch": 1.7275155832591274, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29961395263672, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8819403052330017, + "num_tokens": 517951094.0, + "step": 13580 + }, + { + "epoch": 1.7276427935377179, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.173444747924805, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8572227954864502, + "num_tokens": 517993811.0, + "step": 13581 + }, + { + "epoch": 1.7277700038163084, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.266889572143555, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8704578280448914, + "num_tokens": 518032860.0, + "step": 13582 + }, + { + "epoch": 1.7278972140948987, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35162353515625, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8632405996322632, + "num_tokens": 518070256.0, + "step": 13583 + }, + { + "epoch": 1.7280244243734892, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.25275993347168, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8729710578918457, + "num_tokens": 518102828.0, + "step": 13584 + }, + { + "epoch": 1.7281516346520798, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.360631942749023, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8807870149612427, + "num_tokens": 518146859.0, + "step": 13585 + }, + { + "epoch": 1.7282788449306703, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.187795639038086, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8746540546417236, + "num_tokens": 518186173.0, + "step": 13586 + }, + { + "epoch": 1.7284060552092608, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.202184677124023, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.851607620716095, + "num_tokens": 518221903.0, + "step": 13587 + }, + { + "epoch": 1.7285332654878514, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.09967613220215, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8589306473731995, + "num_tokens": 518258048.0, + "step": 13588 + }, + { + "epoch": 1.7286604757664419, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.415563583374023, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8590879440307617, + "num_tokens": 518299127.0, + "step": 13589 + }, + { + "epoch": 1.7287876860450324, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.325237274169922, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8779784440994263, + "num_tokens": 518336520.0, + "step": 13590 + }, + { + "epoch": 1.728914896323623, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30982208251953, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.862921416759491, + "num_tokens": 518368484.0, + "step": 13591 + }, + { + "epoch": 1.7290421066022135, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.203855514526367, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8650920391082764, + "num_tokens": 518408439.0, + "step": 13592 + }, + { + "epoch": 1.729169316880804, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.042579650878906, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8472668528556824, + "num_tokens": 518447414.0, + "step": 13593 + }, + { + "epoch": 1.7292965271593945, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.410058975219727, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8630229234695435, + "num_tokens": 518486455.0, + "step": 13594 + }, + { + "epoch": 1.729423737437985, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.441850662231445, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8839335441589355, + "num_tokens": 518527023.0, + "step": 13595 + }, + { + "epoch": 1.7295509477165756, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.323747634887695, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8605537414550781, + "num_tokens": 518563285.0, + "step": 13596 + }, + { + "epoch": 1.729678157995166, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.240081787109375, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8681886196136475, + "num_tokens": 518607610.0, + "step": 13597 + }, + { + "epoch": 1.7298053682737566, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.140701293945312, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8756057024002075, + "num_tokens": 518649434.0, + "step": 13598 + }, + { + "epoch": 1.7299325785523472, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3742618560791, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8743112087249756, + "num_tokens": 518684508.0, + "step": 13599 + }, + { + "epoch": 1.7300597888309377, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.132047653198242, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8732532262802124, + "num_tokens": 518717883.0, + "step": 13600 + }, + { + "epoch": 1.730186999109528, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.398828506469727, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8745022416114807, + "num_tokens": 518756158.0, + "step": 13601 + }, + { + "epoch": 1.7303142093881185, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.091197967529297, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8675006031990051, + "num_tokens": 518793003.0, + "step": 13602 + }, + { + "epoch": 1.730441419666709, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12761116027832, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8611828088760376, + "num_tokens": 518838485.0, + "step": 13603 + }, + { + "epoch": 1.7305686299452996, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.330331802368164, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8615218997001648, + "num_tokens": 518885936.0, + "step": 13604 + }, + { + "epoch": 1.7306958402238901, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2017765045166, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8547214269638062, + "num_tokens": 518925684.0, + "step": 13605 + }, + { + "epoch": 1.7308230505024806, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.094694137573242, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8699973821640015, + "num_tokens": 518966438.0, + "step": 13606 + }, + { + "epoch": 1.730950260781071, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28522491455078, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8697187900543213, + "num_tokens": 519013576.0, + "step": 13607 + }, + { + "epoch": 1.7310774710596615, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.934803009033203, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8645561933517456, + "num_tokens": 519045342.0, + "step": 13608 + }, + { + "epoch": 1.731204681338252, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.247652053833008, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8532388210296631, + "num_tokens": 519088566.0, + "step": 13609 + }, + { + "epoch": 1.7313318916168425, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.075578689575195, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8707761764526367, + "num_tokens": 519126516.0, + "step": 13610 + }, + { + "epoch": 1.731459101895433, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.27748680114746, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8758314251899719, + "num_tokens": 519165431.0, + "step": 13611 + }, + { + "epoch": 1.7315863121740236, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.021570205688477, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8827993869781494, + "num_tokens": 519200991.0, + "step": 13612 + }, + { + "epoch": 1.7317135224526141, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.10291862487793, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8738489151000977, + "num_tokens": 519245988.0, + "step": 13613 + }, + { + "epoch": 1.7318407327312046, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35898780822754, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8721218705177307, + "num_tokens": 519282595.0, + "step": 13614 + }, + { + "epoch": 1.7319679430097952, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43535041809082, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8620550036430359, + "num_tokens": 519319061.0, + "step": 13615 + }, + { + "epoch": 1.7320951532883857, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.243011474609375, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8653696775436401, + "num_tokens": 519358617.0, + "step": 13616 + }, + { + "epoch": 1.7322223635669762, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.214447021484375, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8663703203201294, + "num_tokens": 519392461.0, + "step": 13617 + }, + { + "epoch": 1.7323495738455668, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34805679321289, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8752689361572266, + "num_tokens": 519432893.0, + "step": 13618 + }, + { + "epoch": 1.7324767841241573, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.338972091674805, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.867743730545044, + "num_tokens": 519473445.0, + "step": 13619 + }, + { + "epoch": 1.7326039944027478, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.487228393554688, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8672330379486084, + "num_tokens": 519518883.0, + "step": 13620 + }, + { + "epoch": 1.7327312046813383, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.03548240661621, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8718475103378296, + "num_tokens": 519550123.0, + "step": 13621 + }, + { + "epoch": 1.7328584149599289, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.280900955200195, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8586849570274353, + "num_tokens": 519592538.0, + "step": 13622 + }, + { + "epoch": 1.7329856252385194, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.27808380126953, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8682262301445007, + "num_tokens": 519633466.0, + "step": 13623 + }, + { + "epoch": 1.73311283551711, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.273414611816406, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8640339374542236, + "num_tokens": 519672569.0, + "step": 13624 + }, + { + "epoch": 1.7332400457957005, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.465917587280273, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8736958503723145, + "num_tokens": 519707383.0, + "step": 13625 + }, + { + "epoch": 1.7333672560742908, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.148656845092773, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8489187955856323, + "num_tokens": 519748187.0, + "step": 13626 + }, + { + "epoch": 1.7334944663528813, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33052635192871, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8640508651733398, + "num_tokens": 519790215.0, + "step": 13627 + }, + { + "epoch": 1.7336216766314718, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40427017211914, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8592019081115723, + "num_tokens": 519827278.0, + "step": 13628 + }, + { + "epoch": 1.7337488869100623, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.14311981201172, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.872367799282074, + "num_tokens": 519860216.0, + "step": 13629 + }, + { + "epoch": 1.7338760971886529, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44214630126953, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8677514791488647, + "num_tokens": 519894442.0, + "step": 13630 + }, + { + "epoch": 1.7340033074672434, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.220367431640625, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8673074245452881, + "num_tokens": 519932524.0, + "step": 13631 + }, + { + "epoch": 1.7341305177458337, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.441316604614258, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8665685653686523, + "num_tokens": 519974366.0, + "step": 13632 + }, + { + "epoch": 1.7342577280244242, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.222984313964844, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8771107792854309, + "num_tokens": 520013983.0, + "step": 13633 + }, + { + "epoch": 1.7343849383030148, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56296157836914, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8703049421310425, + "num_tokens": 520047671.0, + "step": 13634 + }, + { + "epoch": 1.7345121485816053, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.266950607299805, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8643106818199158, + "num_tokens": 520086574.0, + "step": 13635 + }, + { + "epoch": 1.7346393588601958, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31577491760254, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8529378175735474, + "num_tokens": 520130613.0, + "step": 13636 + }, + { + "epoch": 1.7347665691387864, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.156848907470703, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8853453397750854, + "num_tokens": 520163763.0, + "step": 13637 + }, + { + "epoch": 1.7348937794173769, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29293441772461, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8649515509605408, + "num_tokens": 520198663.0, + "step": 13638 + }, + { + "epoch": 1.7350209896959674, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.305753707885742, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8704400062561035, + "num_tokens": 520231737.0, + "step": 13639 + }, + { + "epoch": 1.735148199974558, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1612491607666, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.872680127620697, + "num_tokens": 520275664.0, + "step": 13640 + }, + { + "epoch": 1.7352754102531485, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4139404296875, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8563311100006104, + "num_tokens": 520316100.0, + "step": 13641 + }, + { + "epoch": 1.735402620531739, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.237205505371094, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8645668625831604, + "num_tokens": 520348167.0, + "step": 13642 + }, + { + "epoch": 1.7355298308103295, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.274675369262695, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8537427186965942, + "num_tokens": 520385234.0, + "step": 13643 + }, + { + "epoch": 1.73565704108892, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33047103881836, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8504826426506042, + "num_tokens": 520423051.0, + "step": 13644 + }, + { + "epoch": 1.7357842513675106, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.401714324951172, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8583309650421143, + "num_tokens": 520467202.0, + "step": 13645 + }, + { + "epoch": 1.735911461646101, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.11758041381836, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8652594685554504, + "num_tokens": 520501434.0, + "step": 13646 + }, + { + "epoch": 1.7360386719246916, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39104461669922, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.864233136177063, + "num_tokens": 520542757.0, + "step": 13647 + }, + { + "epoch": 1.7361658822032822, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.029516220092773, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.857975959777832, + "num_tokens": 520585216.0, + "step": 13648 + }, + { + "epoch": 1.7362930924818727, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.269943237304688, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8686540722846985, + "num_tokens": 520620066.0, + "step": 13649 + }, + { + "epoch": 1.736420302760463, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.317476272583008, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8741713762283325, + "num_tokens": 520660917.0, + "step": 13650 + }, + { + "epoch": 1.7365475130390535, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.357616424560547, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.859423041343689, + "num_tokens": 520704255.0, + "step": 13651 + }, + { + "epoch": 1.736674723317644, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.299541473388672, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8536890745162964, + "num_tokens": 520743036.0, + "step": 13652 + }, + { + "epoch": 1.7368019335962346, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32204246520996, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8605219125747681, + "num_tokens": 520778325.0, + "step": 13653 + }, + { + "epoch": 1.736929143874825, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.250123977661133, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8794717788696289, + "num_tokens": 520815226.0, + "step": 13654 + }, + { + "epoch": 1.7370563541534156, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.389007568359375, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8481402397155762, + "num_tokens": 520856606.0, + "step": 13655 + }, + { + "epoch": 1.737183564432006, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.593372344970703, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8583523035049438, + "num_tokens": 520893493.0, + "step": 13656 + }, + { + "epoch": 1.7373107747105965, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.301149368286133, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8755965232849121, + "num_tokens": 520930391.0, + "step": 13657 + }, + { + "epoch": 1.737437984989187, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.337827682495117, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8664259910583496, + "num_tokens": 520970512.0, + "step": 13658 + }, + { + "epoch": 1.7375651952677775, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.404821395874023, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.861042857170105, + "num_tokens": 521011787.0, + "step": 13659 + }, + { + "epoch": 1.737692405546368, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12709617614746, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8673657178878784, + "num_tokens": 521053494.0, + "step": 13660 + }, + { + "epoch": 1.7378196158249586, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.23228645324707, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8556617498397827, + "num_tokens": 521089291.0, + "step": 13661 + }, + { + "epoch": 1.7379468261035491, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.773975372314453, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8788967728614807, + "num_tokens": 521128910.0, + "step": 13662 + }, + { + "epoch": 1.7380740363821396, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.275197982788086, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.863741934299469, + "num_tokens": 521168175.0, + "step": 13663 + }, + { + "epoch": 1.7382012466607302, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.23855209350586, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.863227128982544, + "num_tokens": 521208020.0, + "step": 13664 + }, + { + "epoch": 1.7383284569393207, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.415056228637695, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8789229393005371, + "num_tokens": 521249828.0, + "step": 13665 + }, + { + "epoch": 1.7384556672179112, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.346717834472656, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8712869882583618, + "num_tokens": 521289940.0, + "step": 13666 + }, + { + "epoch": 1.7385828774965018, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.274627685546875, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8846399784088135, + "num_tokens": 521327426.0, + "step": 13667 + }, + { + "epoch": 1.7387100877750923, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.425701141357422, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8569362163543701, + "num_tokens": 521372112.0, + "step": 13668 + }, + { + "epoch": 1.7388372980536828, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51613998413086, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8661407232284546, + "num_tokens": 521410842.0, + "step": 13669 + }, + { + "epoch": 1.7389645083322733, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.060836791992188, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8768676519393921, + "num_tokens": 521446854.0, + "step": 13670 + }, + { + "epoch": 1.7390917186108639, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.509687423706055, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8575087785720825, + "num_tokens": 521486284.0, + "step": 13671 + }, + { + "epoch": 1.7392189288894544, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39667320251465, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8539982438087463, + "num_tokens": 521522716.0, + "step": 13672 + }, + { + "epoch": 1.739346139168045, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.23616600036621, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8599480390548706, + "num_tokens": 521551549.0, + "step": 13673 + }, + { + "epoch": 1.7394733494466355, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4423885345459, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8703590035438538, + "num_tokens": 521589552.0, + "step": 13674 + }, + { + "epoch": 1.7396005597252258, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.523557662963867, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8782628774642944, + "num_tokens": 521627906.0, + "step": 13675 + }, + { + "epoch": 1.7397277700038163, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.354534149169922, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8507066965103149, + "num_tokens": 521666615.0, + "step": 13676 + }, + { + "epoch": 1.7398549802824068, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.224384307861328, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8622100353240967, + "num_tokens": 521707742.0, + "step": 13677 + }, + { + "epoch": 1.7399821905609973, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.46710205078125, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8570892214775085, + "num_tokens": 521746848.0, + "step": 13678 + }, + { + "epoch": 1.7401094008395879, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59337043762207, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8690497875213623, + "num_tokens": 521782917.0, + "step": 13679 + }, + { + "epoch": 1.7402366111181784, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.415224075317383, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8642444014549255, + "num_tokens": 521825609.0, + "step": 13680 + }, + { + "epoch": 1.7403638213967687, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36457061767578, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8798696994781494, + "num_tokens": 521862277.0, + "step": 13681 + }, + { + "epoch": 1.7404910316753592, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4804630279541, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.884800136089325, + "num_tokens": 521894951.0, + "step": 13682 + }, + { + "epoch": 1.7406182419539498, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18341064453125, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8674152493476868, + "num_tokens": 521932158.0, + "step": 13683 + }, + { + "epoch": 1.7407454522325403, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.385461807250977, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8727797865867615, + "num_tokens": 521971050.0, + "step": 13684 + }, + { + "epoch": 1.7408726625111308, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.967836380004883, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8776872158050537, + "num_tokens": 522007617.0, + "step": 13685 + }, + { + "epoch": 1.7409998727897213, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34550666809082, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8547272682189941, + "num_tokens": 522053484.0, + "step": 13686 + }, + { + "epoch": 1.7411270830683119, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.294414520263672, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8629534840583801, + "num_tokens": 522088047.0, + "step": 13687 + }, + { + "epoch": 1.7412542933469024, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.321626663208008, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8649330139160156, + "num_tokens": 522129865.0, + "step": 13688 + }, + { + "epoch": 1.741381503625493, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.502954483032227, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8624978065490723, + "num_tokens": 522170154.0, + "step": 13689 + }, + { + "epoch": 1.7415087139040835, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.22466278076172, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8906944990158081, + "num_tokens": 522204586.0, + "step": 13690 + }, + { + "epoch": 1.741635924182674, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36191749572754, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8713515996932983, + "num_tokens": 522241942.0, + "step": 13691 + }, + { + "epoch": 1.7417631344612645, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.274188995361328, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8631259202957153, + "num_tokens": 522275800.0, + "step": 13692 + }, + { + "epoch": 1.741890344739855, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13237190246582, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8607770204544067, + "num_tokens": 522317024.0, + "step": 13693 + }, + { + "epoch": 1.7420175550184456, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.426952362060547, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8772223591804504, + "num_tokens": 522358510.0, + "step": 13694 + }, + { + "epoch": 1.742144765297036, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.275461196899414, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8603881001472473, + "num_tokens": 522395960.0, + "step": 13695 + }, + { + "epoch": 1.7422719755756266, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.366703033447266, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8756346702575684, + "num_tokens": 522428807.0, + "step": 13696 + }, + { + "epoch": 1.7423991858542172, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.38601303100586, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8682982921600342, + "num_tokens": 522464472.0, + "step": 13697 + }, + { + "epoch": 1.7425263961328077, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39094352722168, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.880811870098114, + "num_tokens": 522509522.0, + "step": 13698 + }, + { + "epoch": 1.742653606411398, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.463560104370117, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8518806099891663, + "num_tokens": 522545581.0, + "step": 13699 + }, + { + "epoch": 1.7427808166899885, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.25278091430664, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.870680034160614, + "num_tokens": 522582072.0, + "step": 13700 + }, + { + "epoch": 1.742908026968579, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28565788269043, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.870982825756073, + "num_tokens": 522618183.0, + "step": 13701 + }, + { + "epoch": 1.7430352372471696, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.359085083007812, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8563525676727295, + "num_tokens": 522660578.0, + "step": 13702 + }, + { + "epoch": 1.74316244752576, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.410900115966797, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8792562484741211, + "num_tokens": 522701934.0, + "step": 13703 + }, + { + "epoch": 1.7432896578043506, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.175466537475586, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8847705125808716, + "num_tokens": 522737404.0, + "step": 13704 + }, + { + "epoch": 1.743416868082941, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.24653434753418, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8640003204345703, + "num_tokens": 522774188.0, + "step": 13705 + }, + { + "epoch": 1.7435440783615315, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.301355361938477, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8751081228256226, + "num_tokens": 522810467.0, + "step": 13706 + }, + { + "epoch": 1.743671288640122, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26078224182129, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8570474982261658, + "num_tokens": 522845246.0, + "step": 13707 + }, + { + "epoch": 1.7437984989187125, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.262195587158203, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8769493103027344, + "num_tokens": 522877150.0, + "step": 13708 + }, + { + "epoch": 1.743925709197303, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.357681274414062, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8815433382987976, + "num_tokens": 522913161.0, + "step": 13709 + }, + { + "epoch": 1.7440529194758936, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.139291763305664, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8540842533111572, + "num_tokens": 522953464.0, + "step": 13710 + }, + { + "epoch": 1.744180129754484, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.312076568603516, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8720880150794983, + "num_tokens": 522996874.0, + "step": 13711 + }, + { + "epoch": 1.7443073400330746, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.264278411865234, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8710607886314392, + "num_tokens": 523031349.0, + "step": 13712 + }, + { + "epoch": 1.7444345503116652, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17159652709961, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8596948981285095, + "num_tokens": 523068910.0, + "step": 13713 + }, + { + "epoch": 1.7445617605902557, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.322254180908203, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8808575868606567, + "num_tokens": 523106179.0, + "step": 13714 + }, + { + "epoch": 1.7446889708688462, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.10137939453125, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.869498610496521, + "num_tokens": 523140827.0, + "step": 13715 + }, + { + "epoch": 1.7448161811474368, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.637840270996094, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8807669878005981, + "num_tokens": 523182880.0, + "step": 13716 + }, + { + "epoch": 1.7449433914260273, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31108856201172, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8703572154045105, + "num_tokens": 523213059.0, + "step": 13717 + }, + { + "epoch": 1.7450706017046178, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37058448791504, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8660330772399902, + "num_tokens": 523254398.0, + "step": 13718 + }, + { + "epoch": 1.7451978119832083, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43174934387207, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8798894882202148, + "num_tokens": 523293325.0, + "step": 13719 + }, + { + "epoch": 1.7453250222617989, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4223690032959, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.857150673866272, + "num_tokens": 523334402.0, + "step": 13720 + }, + { + "epoch": 1.7454522325403894, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.196605682373047, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8720450401306152, + "num_tokens": 523376238.0, + "step": 13721 + }, + { + "epoch": 1.74557944281898, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.306177139282227, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8898014426231384, + "num_tokens": 523417074.0, + "step": 13722 + }, + { + "epoch": 1.7457066530975704, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.42011070251465, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8757120966911316, + "num_tokens": 523457984.0, + "step": 13723 + }, + { + "epoch": 1.7458338633761608, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.282564163208008, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8623998165130615, + "num_tokens": 523490071.0, + "step": 13724 + }, + { + "epoch": 1.7459610736547513, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.445993423461914, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.861301600933075, + "num_tokens": 523530105.0, + "step": 13725 + }, + { + "epoch": 1.7460882839333418, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40317153930664, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8819106817245483, + "num_tokens": 523576698.0, + "step": 13726 + }, + { + "epoch": 1.7462154942119323, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.050981521606445, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8679912090301514, + "num_tokens": 523614387.0, + "step": 13727 + }, + { + "epoch": 1.7463427044905229, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.629318237304688, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8630311489105225, + "num_tokens": 523651149.0, + "step": 13728 + }, + { + "epoch": 1.7464699147691134, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69561767578125, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8801259398460388, + "num_tokens": 523688448.0, + "step": 13729 + }, + { + "epoch": 1.7465971250477037, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.940631866455078, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8724855184555054, + "num_tokens": 523725805.0, + "step": 13730 + }, + { + "epoch": 1.7467243353262942, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36380958557129, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8691258430480957, + "num_tokens": 523763564.0, + "step": 13731 + }, + { + "epoch": 1.7468515456048848, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48853302001953, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8629648089408875, + "num_tokens": 523804422.0, + "step": 13732 + }, + { + "epoch": 1.7469787558834753, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.120914459228516, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8774459958076477, + "num_tokens": 523845889.0, + "step": 13733 + }, + { + "epoch": 1.7471059661620658, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28989601135254, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8790881633758545, + "num_tokens": 523883623.0, + "step": 13734 + }, + { + "epoch": 1.7472331764406563, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.404239654541016, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8752411603927612, + "num_tokens": 523917273.0, + "step": 13735 + }, + { + "epoch": 1.7473603867192469, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.259124755859375, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.86521315574646, + "num_tokens": 523956149.0, + "step": 13736 + }, + { + "epoch": 1.7474875969978374, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.131175994873047, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8657002449035645, + "num_tokens": 523997229.0, + "step": 13737 + }, + { + "epoch": 1.747614807276428, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.458494186401367, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8711515069007874, + "num_tokens": 524032921.0, + "step": 13738 + }, + { + "epoch": 1.7477420175550185, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.337234497070312, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8706822991371155, + "num_tokens": 524065016.0, + "step": 13739 + }, + { + "epoch": 1.747869227833609, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.358558654785156, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8569937944412231, + "num_tokens": 524107001.0, + "step": 13740 + }, + { + "epoch": 1.7479964381121995, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2457275390625, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8620563745498657, + "num_tokens": 524149283.0, + "step": 13741 + }, + { + "epoch": 1.74812364839079, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.342975616455078, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8684406280517578, + "num_tokens": 524187240.0, + "step": 13742 + }, + { + "epoch": 1.7482508586693806, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.379894256591797, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.876781702041626, + "num_tokens": 524216793.0, + "step": 13743 + }, + { + "epoch": 1.748378068947971, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.38194465637207, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8747742772102356, + "num_tokens": 524257629.0, + "step": 13744 + }, + { + "epoch": 1.7485052792265616, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37115478515625, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.878996729850769, + "num_tokens": 524291090.0, + "step": 13745 + }, + { + "epoch": 1.7486324895051522, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.309329986572266, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8809720873832703, + "num_tokens": 524324882.0, + "step": 13746 + }, + { + "epoch": 1.7487596997837427, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3211669921875, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8532818555831909, + "num_tokens": 524359422.0, + "step": 13747 + }, + { + "epoch": 1.748886910062333, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.272018432617188, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8749097585678101, + "num_tokens": 524398383.0, + "step": 13748 + }, + { + "epoch": 1.7490141203409235, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.307662963867188, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8652305603027344, + "num_tokens": 524437984.0, + "step": 13749 + }, + { + "epoch": 1.749141330619514, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.476743698120117, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8748535513877869, + "num_tokens": 524471324.0, + "step": 13750 + }, + { + "epoch": 1.7492685408981046, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.140214920043945, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.859668493270874, + "num_tokens": 524511471.0, + "step": 13751 + }, + { + "epoch": 1.749395751176695, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29179573059082, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8723270893096924, + "num_tokens": 524541743.0, + "step": 13752 + }, + { + "epoch": 1.7495229614552856, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.414308547973633, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8686707615852356, + "num_tokens": 524581470.0, + "step": 13753 + }, + { + "epoch": 1.749650171733876, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.355295181274414, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8712363839149475, + "num_tokens": 524612722.0, + "step": 13754 + }, + { + "epoch": 1.7497773820124665, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.563507080078125, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8774929046630859, + "num_tokens": 524647640.0, + "step": 13755 + }, + { + "epoch": 1.749904592291057, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.404720306396484, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8616786003112793, + "num_tokens": 524682366.0, + "step": 13756 + }, + { + "epoch": 1.7500318025696475, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37580680847168, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8681997060775757, + "num_tokens": 524724242.0, + "step": 13757 + }, + { + "epoch": 1.750159012848238, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.386354446411133, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8556419610977173, + "num_tokens": 524757373.0, + "step": 13758 + }, + { + "epoch": 1.7502862231268286, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.367612838745117, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8576749563217163, + "num_tokens": 524797704.0, + "step": 13759 + }, + { + "epoch": 1.750413433405419, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32684326171875, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8715479969978333, + "num_tokens": 524836710.0, + "step": 13760 + }, + { + "epoch": 1.7505406436840096, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37449073791504, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8666386604309082, + "num_tokens": 524870224.0, + "step": 13761 + }, + { + "epoch": 1.7506678539626002, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.290813446044922, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8711337447166443, + "num_tokens": 524905345.0, + "step": 13762 + }, + { + "epoch": 1.7507950642411907, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.17974090576172, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8525870442390442, + "num_tokens": 524943893.0, + "step": 13763 + }, + { + "epoch": 1.7509222745197812, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.449371337890625, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8669338226318359, + "num_tokens": 524982924.0, + "step": 13764 + }, + { + "epoch": 1.7510494847983717, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.252639770507812, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8671878576278687, + "num_tokens": 525016749.0, + "step": 13765 + }, + { + "epoch": 1.7511766950769623, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43293571472168, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8701452016830444, + "num_tokens": 525049178.0, + "step": 13766 + }, + { + "epoch": 1.7513039053555528, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.273653030395508, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8557044267654419, + "num_tokens": 525083456.0, + "step": 13767 + }, + { + "epoch": 1.7514311156341433, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.239852905273438, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8699120283126831, + "num_tokens": 525120159.0, + "step": 13768 + }, + { + "epoch": 1.7515583259127339, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.525774002075195, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8741629719734192, + "num_tokens": 525159174.0, + "step": 13769 + }, + { + "epoch": 1.7516855361913244, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.033254623413086, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8650116920471191, + "num_tokens": 525198373.0, + "step": 13770 + }, + { + "epoch": 1.751812746469915, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.212644577026367, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8742324709892273, + "num_tokens": 525240618.0, + "step": 13771 + }, + { + "epoch": 1.7519399567485054, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.317028045654297, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8828709125518799, + "num_tokens": 525274899.0, + "step": 13772 + }, + { + "epoch": 1.7520671670270958, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.12488555908203, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8491334915161133, + "num_tokens": 525310144.0, + "step": 13773 + }, + { + "epoch": 1.7521943773056863, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.135276794433594, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8782414197921753, + "num_tokens": 525348565.0, + "step": 13774 + }, + { + "epoch": 1.7523215875842768, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.46054458618164, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8741675019264221, + "num_tokens": 525383709.0, + "step": 13775 + }, + { + "epoch": 1.7524487978628673, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.187891006469727, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8629660606384277, + "num_tokens": 525423983.0, + "step": 13776 + }, + { + "epoch": 1.7525760081414579, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.20628547668457, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8754297494888306, + "num_tokens": 525457357.0, + "step": 13777 + }, + { + "epoch": 1.7527032184200484, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.553281784057617, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.866941511631012, + "num_tokens": 525497021.0, + "step": 13778 + }, + { + "epoch": 1.7528304286986387, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40273094177246, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8850235939025879, + "num_tokens": 525533592.0, + "step": 13779 + }, + { + "epoch": 1.7529576389772292, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.087007522583008, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8604419231414795, + "num_tokens": 525572747.0, + "step": 13780 + }, + { + "epoch": 1.7530848492558198, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67868423461914, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.874515175819397, + "num_tokens": 525613578.0, + "step": 13781 + }, + { + "epoch": 1.7532120595344103, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.160287857055664, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8750061988830566, + "num_tokens": 525655047.0, + "step": 13782 + }, + { + "epoch": 1.7533392698130008, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.551511764526367, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8649998903274536, + "num_tokens": 525689561.0, + "step": 13783 + }, + { + "epoch": 1.7534664800915913, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.367441177368164, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8640309572219849, + "num_tokens": 525726946.0, + "step": 13784 + }, + { + "epoch": 1.7535936903701819, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.279279708862305, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8618795871734619, + "num_tokens": 525762089.0, + "step": 13785 + }, + { + "epoch": 1.7537209006487724, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50870704650879, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8738141655921936, + "num_tokens": 525796081.0, + "step": 13786 + }, + { + "epoch": 1.753848110927363, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.214523315429688, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.857977569103241, + "num_tokens": 525829945.0, + "step": 13787 + }, + { + "epoch": 1.7539753212059535, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.064254760742188, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8768110871315002, + "num_tokens": 525871289.0, + "step": 13788 + }, + { + "epoch": 1.754102531484544, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.507335662841797, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.869920551776886, + "num_tokens": 525908455.0, + "step": 13789 + }, + { + "epoch": 1.7542297417631345, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.313030242919922, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8900403380393982, + "num_tokens": 525943734.0, + "step": 13790 + }, + { + "epoch": 1.754356952041725, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.262887954711914, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.891139566898346, + "num_tokens": 525988070.0, + "step": 13791 + }, + { + "epoch": 1.7544841623203156, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.292757034301758, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8835152387619019, + "num_tokens": 526027940.0, + "step": 13792 + }, + { + "epoch": 1.754611372598906, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.38709259033203, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8792469501495361, + "num_tokens": 526061357.0, + "step": 13793 + }, + { + "epoch": 1.7547385828774966, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.13932991027832, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8713724613189697, + "num_tokens": 526102157.0, + "step": 13794 + }, + { + "epoch": 1.7548657931560872, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.221315383911133, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8603541851043701, + "num_tokens": 526137241.0, + "step": 13795 + }, + { + "epoch": 1.7549930034346777, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.169063568115234, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8599157929420471, + "num_tokens": 526169079.0, + "step": 13796 + }, + { + "epoch": 1.755120213713268, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.148283004760742, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8733842968940735, + "num_tokens": 526212212.0, + "step": 13797 + }, + { + "epoch": 1.7552474239918585, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.382219314575195, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8772603273391724, + "num_tokens": 526250380.0, + "step": 13798 + }, + { + "epoch": 1.755374634270449, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.215293884277344, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.876089334487915, + "num_tokens": 526292525.0, + "step": 13799 + }, + { + "epoch": 1.7555018445490396, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.373043060302734, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8696216344833374, + "num_tokens": 526330427.0, + "step": 13800 + }, + { + "epoch": 1.75562905482763, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.266807556152344, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8717447519302368, + "num_tokens": 526362954.0, + "step": 13801 + }, + { + "epoch": 1.7557562651062206, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.52854347229004, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8636633157730103, + "num_tokens": 526394310.0, + "step": 13802 + }, + { + "epoch": 1.755883475384811, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.304323196411133, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.850020170211792, + "num_tokens": 526438507.0, + "step": 13803 + }, + { + "epoch": 1.7560106856634015, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.409969329833984, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8667011857032776, + "num_tokens": 526475124.0, + "step": 13804 + }, + { + "epoch": 1.756137895941992, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.299135208129883, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8775149583816528, + "num_tokens": 526514936.0, + "step": 13805 + }, + { + "epoch": 1.7562651062205825, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.319726943969727, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.864172101020813, + "num_tokens": 526548649.0, + "step": 13806 + }, + { + "epoch": 1.756392316499173, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.411523818969727, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8533266186714172, + "num_tokens": 526589028.0, + "step": 13807 + }, + { + "epoch": 1.7565195267777636, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.196428298950195, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8694295287132263, + "num_tokens": 526626327.0, + "step": 13808 + }, + { + "epoch": 1.756646737056354, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.298065185546875, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8653805255889893, + "num_tokens": 526672264.0, + "step": 13809 + }, + { + "epoch": 1.7567739473349446, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.323097229003906, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8629021644592285, + "num_tokens": 526709979.0, + "step": 13810 + }, + { + "epoch": 1.7569011576135352, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2357177734375, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8811603784561157, + "num_tokens": 526745002.0, + "step": 13811 + }, + { + "epoch": 1.7570283678921257, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.541105270385742, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8842810988426208, + "num_tokens": 526785684.0, + "step": 13812 + }, + { + "epoch": 1.7571555781707162, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.214099884033203, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8865285515785217, + "num_tokens": 526820246.0, + "step": 13813 + }, + { + "epoch": 1.7572827884493067, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.215667724609375, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.873898983001709, + "num_tokens": 526854815.0, + "step": 13814 + }, + { + "epoch": 1.7574099987278973, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.470041275024414, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8751477003097534, + "num_tokens": 526896336.0, + "step": 13815 + }, + { + "epoch": 1.7575372090064878, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.246746063232422, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8813414573669434, + "num_tokens": 526939586.0, + "step": 13816 + }, + { + "epoch": 1.7576644192850783, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.126741409301758, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8738536834716797, + "num_tokens": 526975027.0, + "step": 13817 + }, + { + "epoch": 1.7577916295636689, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.361774444580078, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8711978197097778, + "num_tokens": 527017820.0, + "step": 13818 + }, + { + "epoch": 1.7579188398422594, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.311607360839844, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8804536461830139, + "num_tokens": 527051647.0, + "step": 13819 + }, + { + "epoch": 1.75804605012085, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.309328079223633, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8750121593475342, + "num_tokens": 527089590.0, + "step": 13820 + }, + { + "epoch": 1.7581732603994404, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40985870361328, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.866377592086792, + "num_tokens": 527126696.0, + "step": 13821 + }, + { + "epoch": 1.7583004706780307, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48917007446289, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8613481521606445, + "num_tokens": 527164823.0, + "step": 13822 + }, + { + "epoch": 1.7584276809566213, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.342477798461914, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8521651029586792, + "num_tokens": 527208032.0, + "step": 13823 + }, + { + "epoch": 1.7585548912352118, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.460874557495117, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8704013824462891, + "num_tokens": 527242773.0, + "step": 13824 + }, + { + "epoch": 1.7586821015138023, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32607078552246, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8725273609161377, + "num_tokens": 527277968.0, + "step": 13825 + }, + { + "epoch": 1.7588093117923929, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.589162826538086, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8830060958862305, + "num_tokens": 527311077.0, + "step": 13826 + }, + { + "epoch": 1.7589365220709834, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.280908584594727, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8748868703842163, + "num_tokens": 527340543.0, + "step": 13827 + }, + { + "epoch": 1.7590637323495737, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.364151000976562, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8671162724494934, + "num_tokens": 527379596.0, + "step": 13828 + }, + { + "epoch": 1.7591909426281642, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37265396118164, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8688246607780457, + "num_tokens": 527417873.0, + "step": 13829 + }, + { + "epoch": 1.7593181529067548, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41414451599121, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.871727705001831, + "num_tokens": 527458607.0, + "step": 13830 + }, + { + "epoch": 1.7594453631853453, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37055206298828, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.861963152885437, + "num_tokens": 527493299.0, + "step": 13831 + }, + { + "epoch": 1.7595725734639358, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.47500228881836, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8565982580184937, + "num_tokens": 527536065.0, + "step": 13832 + }, + { + "epoch": 1.7596997837425263, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18242645263672, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8539767265319824, + "num_tokens": 527576477.0, + "step": 13833 + }, + { + "epoch": 1.7598269940211169, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.754531860351562, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8671107292175293, + "num_tokens": 527611777.0, + "step": 13834 + }, + { + "epoch": 1.7599542042997074, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.251361846923828, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8871588110923767, + "num_tokens": 527650236.0, + "step": 13835 + }, + { + "epoch": 1.760081414578298, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.627050399780273, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8754868507385254, + "num_tokens": 527685486.0, + "step": 13836 + }, + { + "epoch": 1.7602086248568884, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.245573043823242, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8566524982452393, + "num_tokens": 527725386.0, + "step": 13837 + }, + { + "epoch": 1.760335835135479, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.08078956604004, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8562375903129578, + "num_tokens": 527756213.0, + "step": 13838 + }, + { + "epoch": 1.7604630454140695, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7733211517334, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8735964298248291, + "num_tokens": 527793884.0, + "step": 13839 + }, + { + "epoch": 1.76059025569266, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.251605987548828, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8669137954711914, + "num_tokens": 527826428.0, + "step": 13840 + }, + { + "epoch": 1.7607174659712506, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.480911254882812, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.860966145992279, + "num_tokens": 527866691.0, + "step": 13841 + }, + { + "epoch": 1.760844676249841, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.157987594604492, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.877633810043335, + "num_tokens": 527909482.0, + "step": 13842 + }, + { + "epoch": 1.7609718865284316, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41388702392578, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8582871556282043, + "num_tokens": 527947818.0, + "step": 13843 + }, + { + "epoch": 1.7610990968070221, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15778350830078, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8729284405708313, + "num_tokens": 527986760.0, + "step": 13844 + }, + { + "epoch": 1.7612263070856127, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36477279663086, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8628841042518616, + "num_tokens": 528024742.0, + "step": 13845 + }, + { + "epoch": 1.761353517364203, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.365890502929688, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8555766344070435, + "num_tokens": 528060781.0, + "step": 13846 + }, + { + "epoch": 1.7614807276427935, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.222122192382812, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8710572719573975, + "num_tokens": 528099051.0, + "step": 13847 + }, + { + "epoch": 1.761607937921384, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.330183029174805, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8625713586807251, + "num_tokens": 528138325.0, + "step": 13848 + }, + { + "epoch": 1.7617351481999746, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.196491241455078, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8610934615135193, + "num_tokens": 528177997.0, + "step": 13849 + }, + { + "epoch": 1.761862358478565, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.446434020996094, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8646899461746216, + "num_tokens": 528216360.0, + "step": 13850 + }, + { + "epoch": 1.7619895687571556, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.152645111083984, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8845036029815674, + "num_tokens": 528251100.0, + "step": 13851 + }, + { + "epoch": 1.762116779035746, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.191883087158203, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8735454082489014, + "num_tokens": 528286298.0, + "step": 13852 + }, + { + "epoch": 1.7622439893143365, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.306806564331055, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8854703903198242, + "num_tokens": 528322970.0, + "step": 13853 + }, + { + "epoch": 1.762371199592927, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.27208137512207, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8852924704551697, + "num_tokens": 528355191.0, + "step": 13854 + }, + { + "epoch": 1.7624984098715175, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.415983200073242, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8687317371368408, + "num_tokens": 528388975.0, + "step": 13855 + }, + { + "epoch": 1.762625620150108, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.291669845581055, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8744401931762695, + "num_tokens": 528432139.0, + "step": 13856 + }, + { + "epoch": 1.7627528304286986, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.437891006469727, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8789817094802856, + "num_tokens": 528468315.0, + "step": 13857 + }, + { + "epoch": 1.762880040707289, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.273921966552734, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8582078218460083, + "num_tokens": 528510368.0, + "step": 13858 + }, + { + "epoch": 1.7630072509858796, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.460546493530273, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8723057508468628, + "num_tokens": 528545637.0, + "step": 13859 + }, + { + "epoch": 1.7631344612644702, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31143569946289, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8741474747657776, + "num_tokens": 528586267.0, + "step": 13860 + }, + { + "epoch": 1.7632616715430607, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.470985412597656, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8736864328384399, + "num_tokens": 528629640.0, + "step": 13861 + }, + { + "epoch": 1.7633888818216512, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.283672332763672, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8556161522865295, + "num_tokens": 528665373.0, + "step": 13862 + }, + { + "epoch": 1.7635160921002417, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44485855102539, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8840914964675903, + "num_tokens": 528703426.0, + "step": 13863 + }, + { + "epoch": 1.7636433023788323, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.47283363342285, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8516135811805725, + "num_tokens": 528744886.0, + "step": 13864 + }, + { + "epoch": 1.7637705126574228, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.317825317382812, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8822213411331177, + "num_tokens": 528779773.0, + "step": 13865 + }, + { + "epoch": 1.7638977229360133, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44266700744629, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8726286292076111, + "num_tokens": 528821878.0, + "step": 13866 + }, + { + "epoch": 1.7640249332146039, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.428686141967773, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8712375164031982, + "num_tokens": 528858056.0, + "step": 13867 + }, + { + "epoch": 1.7641521434931944, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.335514068603516, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8669877052307129, + "num_tokens": 528903116.0, + "step": 13868 + }, + { + "epoch": 1.764279353771785, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67820930480957, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.864081859588623, + "num_tokens": 528941561.0, + "step": 13869 + }, + { + "epoch": 1.7644065640503754, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.357019424438477, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8864225745201111, + "num_tokens": 528976215.0, + "step": 13870 + }, + { + "epoch": 1.7645337743289657, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.319799423217773, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8761653900146484, + "num_tokens": 529019280.0, + "step": 13871 + }, + { + "epoch": 1.7646609846075563, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2928409576416, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8725566864013672, + "num_tokens": 529060733.0, + "step": 13872 + }, + { + "epoch": 1.7647881948861468, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.306488037109375, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8722086548805237, + "num_tokens": 529094948.0, + "step": 13873 + }, + { + "epoch": 1.7649154051647373, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.584829330444336, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8803585767745972, + "num_tokens": 529131374.0, + "step": 13874 + }, + { + "epoch": 1.7650426154433279, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.393747329711914, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8665593862533569, + "num_tokens": 529169133.0, + "step": 13875 + }, + { + "epoch": 1.7651698257219184, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.236774444580078, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8679920434951782, + "num_tokens": 529206467.0, + "step": 13876 + }, + { + "epoch": 1.7652970360005087, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.441505432128906, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8605082035064697, + "num_tokens": 529244891.0, + "step": 13877 + }, + { + "epoch": 1.7654242462790992, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3258113861084, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8722511529922485, + "num_tokens": 529277384.0, + "step": 13878 + }, + { + "epoch": 1.7655514565576897, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53972816467285, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8730547428131104, + "num_tokens": 529322530.0, + "step": 13879 + }, + { + "epoch": 1.7656786668362803, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.597028732299805, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.864771842956543, + "num_tokens": 529356345.0, + "step": 13880 + }, + { + "epoch": 1.7658058771148708, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.489299774169922, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8720019459724426, + "num_tokens": 529391877.0, + "step": 13881 + }, + { + "epoch": 1.7659330873934613, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.396541595458984, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8767483234405518, + "num_tokens": 529425460.0, + "step": 13882 + }, + { + "epoch": 1.7660602976720519, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.548315048217773, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8601703643798828, + "num_tokens": 529459660.0, + "step": 13883 + }, + { + "epoch": 1.7661875079506424, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6554012298584, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8628661632537842, + "num_tokens": 529501839.0, + "step": 13884 + }, + { + "epoch": 1.766314718229233, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.303964614868164, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8671128749847412, + "num_tokens": 529540828.0, + "step": 13885 + }, + { + "epoch": 1.7664419285078234, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.155427932739258, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8695986270904541, + "num_tokens": 529581578.0, + "step": 13886 + }, + { + "epoch": 1.766569138786414, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.220928192138672, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8691755533218384, + "num_tokens": 529624363.0, + "step": 13887 + }, + { + "epoch": 1.7666963490650045, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34993553161621, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8766605854034424, + "num_tokens": 529665286.0, + "step": 13888 + }, + { + "epoch": 1.766823559343595, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41718292236328, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.865768551826477, + "num_tokens": 529704150.0, + "step": 13889 + }, + { + "epoch": 1.7669507696221856, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.300748825073242, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8721586465835571, + "num_tokens": 529742110.0, + "step": 13890 + }, + { + "epoch": 1.767077979900776, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.362777709960938, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8687730431556702, + "num_tokens": 529776158.0, + "step": 13891 + }, + { + "epoch": 1.7672051901793666, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.724658966064453, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8477354049682617, + "num_tokens": 529814002.0, + "step": 13892 + }, + { + "epoch": 1.7673324004579571, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.357585906982422, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.873154878616333, + "num_tokens": 529846696.0, + "step": 13893 + }, + { + "epoch": 1.7674596107365477, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.361162185668945, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8673961758613586, + "num_tokens": 529881557.0, + "step": 13894 + }, + { + "epoch": 1.767586821015138, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.296873092651367, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8578149080276489, + "num_tokens": 529924337.0, + "step": 13895 + }, + { + "epoch": 1.7677140312937285, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43213653564453, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8688496947288513, + "num_tokens": 529967195.0, + "step": 13896 + }, + { + "epoch": 1.767841241572319, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28312110900879, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8515069484710693, + "num_tokens": 530003454.0, + "step": 13897 + }, + { + "epoch": 1.7679684518509096, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.596525192260742, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8438749313354492, + "num_tokens": 530042577.0, + "step": 13898 + }, + { + "epoch": 1.7680956621295, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.275760650634766, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8547632694244385, + "num_tokens": 530084903.0, + "step": 13899 + }, + { + "epoch": 1.7682228724080906, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41205596923828, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.865534782409668, + "num_tokens": 530117598.0, + "step": 13900 + }, + { + "epoch": 1.768350082686681, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59937858581543, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8724607825279236, + "num_tokens": 530151891.0, + "step": 13901 + }, + { + "epoch": 1.7684772929652715, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.1889705657959, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8608461618423462, + "num_tokens": 530191263.0, + "step": 13902 + }, + { + "epoch": 1.768604503243862, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30446434020996, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8641416430473328, + "num_tokens": 530231156.0, + "step": 13903 + }, + { + "epoch": 1.7687317135224525, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.518407821655273, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8822804689407349, + "num_tokens": 530270089.0, + "step": 13904 + }, + { + "epoch": 1.768858923801043, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.538938522338867, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8720096945762634, + "num_tokens": 530309808.0, + "step": 13905 + }, + { + "epoch": 1.7689861340796336, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18892478942871, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8576595783233643, + "num_tokens": 530357723.0, + "step": 13906 + }, + { + "epoch": 1.769113344358224, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.426166534423828, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8559905886650085, + "num_tokens": 530389586.0, + "step": 13907 + }, + { + "epoch": 1.7692405546368146, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.425817489624023, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8586538434028625, + "num_tokens": 530430356.0, + "step": 13908 + }, + { + "epoch": 1.7693677649154052, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.319377899169922, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.873489260673523, + "num_tokens": 530472859.0, + "step": 13909 + }, + { + "epoch": 1.7694949751939957, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.545278549194336, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.854836642742157, + "num_tokens": 530515422.0, + "step": 13910 + }, + { + "epoch": 1.7696221854725862, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43755340576172, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8668898344039917, + "num_tokens": 530553842.0, + "step": 13911 + }, + { + "epoch": 1.7697493957511767, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.455406188964844, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8678881525993347, + "num_tokens": 530592593.0, + "step": 13912 + }, + { + "epoch": 1.7698766060297673, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.437946319580078, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8620738983154297, + "num_tokens": 530632309.0, + "step": 13913 + }, + { + "epoch": 1.7700038163083578, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.330101013183594, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8633712530136108, + "num_tokens": 530667415.0, + "step": 13914 + }, + { + "epoch": 1.7701310265869483, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.413570404052734, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8652875423431396, + "num_tokens": 530705736.0, + "step": 13915 + }, + { + "epoch": 1.7702582368655388, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.45753288269043, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8769180178642273, + "num_tokens": 530744931.0, + "step": 13916 + }, + { + "epoch": 1.7703854471441294, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.671207427978516, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8804272413253784, + "num_tokens": 530784700.0, + "step": 13917 + }, + { + "epoch": 1.77051265742272, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28285026550293, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8845648765563965, + "num_tokens": 530821097.0, + "step": 13918 + }, + { + "epoch": 1.7706398677013102, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.349166870117188, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8621196746826172, + "num_tokens": 530860444.0, + "step": 13919 + }, + { + "epoch": 1.7707670779799007, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30315399169922, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8660248517990112, + "num_tokens": 530907304.0, + "step": 13920 + }, + { + "epoch": 1.7708942882584913, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.42216682434082, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8605715036392212, + "num_tokens": 530945692.0, + "step": 13921 + }, + { + "epoch": 1.7710214985370818, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.656675338745117, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8778953552246094, + "num_tokens": 530986695.0, + "step": 13922 + }, + { + "epoch": 1.7711487088156723, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31169891357422, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8794797658920288, + "num_tokens": 531022656.0, + "step": 13923 + }, + { + "epoch": 1.7712759190942629, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.400379180908203, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.877458930015564, + "num_tokens": 531056896.0, + "step": 13924 + }, + { + "epoch": 1.7714031293728534, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63706398010254, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8418083786964417, + "num_tokens": 531097286.0, + "step": 13925 + }, + { + "epoch": 1.7715303396514437, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3757381439209, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8760664463043213, + "num_tokens": 531136430.0, + "step": 13926 + }, + { + "epoch": 1.7716575499300342, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6114501953125, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8612833619117737, + "num_tokens": 531176698.0, + "step": 13927 + }, + { + "epoch": 1.7717847602086247, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.480140686035156, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8672874569892883, + "num_tokens": 531214343.0, + "step": 13928 + }, + { + "epoch": 1.7719119704872153, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.442140579223633, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8802309036254883, + "num_tokens": 531252325.0, + "step": 13929 + }, + { + "epoch": 1.7720391807658058, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51922035217285, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8567827939987183, + "num_tokens": 531290378.0, + "step": 13930 + }, + { + "epoch": 1.7721663910443963, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.494224548339844, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8592647314071655, + "num_tokens": 531330406.0, + "step": 13931 + }, + { + "epoch": 1.7722936013229869, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4620361328125, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8670423030853271, + "num_tokens": 531367425.0, + "step": 13932 + }, + { + "epoch": 1.7724208116015774, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.459026336669922, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.889072060585022, + "num_tokens": 531406465.0, + "step": 13933 + }, + { + "epoch": 1.772548021880168, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.648094177246094, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8579638004302979, + "num_tokens": 531445356.0, + "step": 13934 + }, + { + "epoch": 1.7726752321587584, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43872833251953, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8655049800872803, + "num_tokens": 531484737.0, + "step": 13935 + }, + { + "epoch": 1.772802442437349, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63702392578125, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8643669486045837, + "num_tokens": 531523453.0, + "step": 13936 + }, + { + "epoch": 1.7729296527159395, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.535566329956055, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8746676445007324, + "num_tokens": 531559627.0, + "step": 13937 + }, + { + "epoch": 1.77305686299453, + "ewc_loss": 0.0341796875, + "ewc_loss_parallel": 3.409385681152344e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.439144134521484, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8734637498855591, + "num_tokens": 531597677.0, + "step": 13938 + }, + { + "epoch": 1.7731840732731206, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33645248413086, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8772066831588745, + "num_tokens": 531642753.0, + "step": 13939 + }, + { + "epoch": 1.773311283551711, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.271059036254883, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.860878586769104, + "num_tokens": 531688249.0, + "step": 13940 + }, + { + "epoch": 1.7734384938303016, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.356447219848633, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8820929527282715, + "num_tokens": 531727994.0, + "step": 13941 + }, + { + "epoch": 1.7735657041088921, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.603347778320312, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8714594841003418, + "num_tokens": 531764428.0, + "step": 13942 + }, + { + "epoch": 1.7736929143874827, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.447362899780273, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8630340099334717, + "num_tokens": 531806604.0, + "step": 13943 + }, + { + "epoch": 1.773820124666073, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.310989379882812, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.886623740196228, + "num_tokens": 531845504.0, + "step": 13944 + }, + { + "epoch": 1.7739473349446635, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.309616088867188, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8783944249153137, + "num_tokens": 531880962.0, + "step": 13945 + }, + { + "epoch": 1.774074545223254, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.38330078125, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8669769763946533, + "num_tokens": 531918182.0, + "step": 13946 + }, + { + "epoch": 1.7742017555018446, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.031818389892578, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8690425157546997, + "num_tokens": 531958340.0, + "step": 13947 + }, + { + "epoch": 1.774328965780435, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.534666061401367, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8568543791770935, + "num_tokens": 531996951.0, + "step": 13948 + }, + { + "epoch": 1.7744561760590256, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.648624420166016, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8722628355026245, + "num_tokens": 532036963.0, + "step": 13949 + }, + { + "epoch": 1.774583386337616, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.357027053833008, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8625461459159851, + "num_tokens": 532070937.0, + "step": 13950 + }, + { + "epoch": 1.7747105966162064, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53368377685547, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.85552978515625, + "num_tokens": 532107866.0, + "step": 13951 + }, + { + "epoch": 1.774837806894797, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60190773010254, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8611540794372559, + "num_tokens": 532151485.0, + "step": 13952 + }, + { + "epoch": 1.7749650171733875, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.381439208984375, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8743512630462646, + "num_tokens": 532191209.0, + "step": 13953 + }, + { + "epoch": 1.775092227451978, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.808155059814453, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8634134531021118, + "num_tokens": 532232219.0, + "step": 13954 + }, + { + "epoch": 1.7752194377305686, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.249073028564453, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.857049822807312, + "num_tokens": 532265330.0, + "step": 13955 + }, + { + "epoch": 1.775346648009159, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.557289123535156, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8674799203872681, + "num_tokens": 532295964.0, + "step": 13956 + }, + { + "epoch": 1.7754738582877496, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69220733642578, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8724642992019653, + "num_tokens": 532333804.0, + "step": 13957 + }, + { + "epoch": 1.7756010685663401, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15152931213379, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8725876808166504, + "num_tokens": 532373088.0, + "step": 13958 + }, + { + "epoch": 1.7757282788449307, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50473403930664, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.866094708442688, + "num_tokens": 532409473.0, + "step": 13959 + }, + { + "epoch": 1.7758554891235212, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.682533264160156, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8795529007911682, + "num_tokens": 532447883.0, + "step": 13960 + }, + { + "epoch": 1.7759826994021117, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.150049209594727, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8730058670043945, + "num_tokens": 532485783.0, + "step": 13961 + }, + { + "epoch": 1.7761099096807023, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.480205535888672, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8747261762619019, + "num_tokens": 532511082.0, + "step": 13962 + }, + { + "epoch": 1.7762371199592928, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34652328491211, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8843799233436584, + "num_tokens": 532555516.0, + "step": 13963 + }, + { + "epoch": 1.7763643302378833, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.262767791748047, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8528719544410706, + "num_tokens": 532599200.0, + "step": 13964 + }, + { + "epoch": 1.7764915405164738, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.241436004638672, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8795289993286133, + "num_tokens": 532636335.0, + "step": 13965 + }, + { + "epoch": 1.7766187507950644, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6356201171875, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8806664943695068, + "num_tokens": 532678337.0, + "step": 13966 + }, + { + "epoch": 1.776745961073655, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.392637252807617, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8646248579025269, + "num_tokens": 532715051.0, + "step": 13967 + }, + { + "epoch": 1.7768731713522452, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.252370834350586, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8792202472686768, + "num_tokens": 532750836.0, + "step": 13968 + }, + { + "epoch": 1.7770003816308357, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37644386291504, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8745302557945251, + "num_tokens": 532790331.0, + "step": 13969 + }, + { + "epoch": 1.7771275919094263, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.24667739868164, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8646401166915894, + "num_tokens": 532830169.0, + "step": 13970 + }, + { + "epoch": 1.7772548021880168, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.354917526245117, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8622983694076538, + "num_tokens": 532865957.0, + "step": 13971 + }, + { + "epoch": 1.7773820124666073, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.55322265625, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8652086853981018, + "num_tokens": 532906112.0, + "step": 13972 + }, + { + "epoch": 1.7775092227451978, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.38789176940918, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8749878406524658, + "num_tokens": 532939676.0, + "step": 13973 + }, + { + "epoch": 1.7776364330237884, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4973087310791, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8512948751449585, + "num_tokens": 532977668.0, + "step": 13974 + }, + { + "epoch": 1.7777636433023787, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.668283462524414, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8665741682052612, + "num_tokens": 533013862.0, + "step": 13975 + }, + { + "epoch": 1.7778908535809692, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.20895004272461, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8569921255111694, + "num_tokens": 533051222.0, + "step": 13976 + }, + { + "epoch": 1.7780180638595597, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.506288528442383, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8662074208259583, + "num_tokens": 533090182.0, + "step": 13977 + }, + { + "epoch": 1.7781452741381503, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.45454216003418, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8813634514808655, + "num_tokens": 533129600.0, + "step": 13978 + }, + { + "epoch": 1.7782724844167408, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.231882095336914, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8685208559036255, + "num_tokens": 533172351.0, + "step": 13979 + }, + { + "epoch": 1.7783996946953313, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.656156539916992, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8655839562416077, + "num_tokens": 533211583.0, + "step": 13980 + }, + { + "epoch": 1.7785269049739219, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.533966064453125, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8749409914016724, + "num_tokens": 533249960.0, + "step": 13981 + }, + { + "epoch": 1.7786541152525124, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.391605377197266, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8758187294006348, + "num_tokens": 533287328.0, + "step": 13982 + }, + { + "epoch": 1.778781325531103, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.428646087646484, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8692992925643921, + "num_tokens": 533327772.0, + "step": 13983 + }, + { + "epoch": 1.7789085358096934, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.709699630737305, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8624366521835327, + "num_tokens": 533372004.0, + "step": 13984 + }, + { + "epoch": 1.779035746088284, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.66413116455078, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8664041757583618, + "num_tokens": 533409502.0, + "step": 13985 + }, + { + "epoch": 1.7791629563668745, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.248659133911133, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8564525246620178, + "num_tokens": 533451160.0, + "step": 13986 + }, + { + "epoch": 1.779290166645465, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.290794372558594, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8762860894203186, + "num_tokens": 533480413.0, + "step": 13987 + }, + { + "epoch": 1.7794173769240555, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.42482566833496, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8568689227104187, + "num_tokens": 533518826.0, + "step": 13988 + }, + { + "epoch": 1.779544587202646, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.318225860595703, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8578015565872192, + "num_tokens": 533563760.0, + "step": 13989 + }, + { + "epoch": 1.7796717974812366, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.561874389648438, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8686593770980835, + "num_tokens": 533601499.0, + "step": 13990 + }, + { + "epoch": 1.7797990077598271, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.438777923583984, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8728671073913574, + "num_tokens": 533640757.0, + "step": 13991 + }, + { + "epoch": 1.7799262180384177, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.27611541748047, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8700742721557617, + "num_tokens": 533678047.0, + "step": 13992 + }, + { + "epoch": 1.780053428317008, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.522489547729492, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8689351677894592, + "num_tokens": 533720903.0, + "step": 13993 + }, + { + "epoch": 1.7801806385955985, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.537240982055664, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.870538592338562, + "num_tokens": 533759720.0, + "step": 13994 + }, + { + "epoch": 1.780307848874189, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.307676315307617, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8568957448005676, + "num_tokens": 533796122.0, + "step": 13995 + }, + { + "epoch": 1.7804350591527796, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.712512969970703, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8749427795410156, + "num_tokens": 533836326.0, + "step": 13996 + }, + { + "epoch": 1.78056226943137, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75582504272461, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8798316717147827, + "num_tokens": 533867874.0, + "step": 13997 + }, + { + "epoch": 1.7806894797099606, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.359037399291992, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8815067410469055, + "num_tokens": 533902877.0, + "step": 13998 + }, + { + "epoch": 1.780816689988551, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.843358993530273, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8723406195640564, + "num_tokens": 533937440.0, + "step": 13999 + }, + { + "epoch": 1.7809439002671414, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4062442779541, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.877973198890686, + "num_tokens": 533972308.0, + "step": 14000 + }, + { + "epoch": 1.781071110545732, + "ewc_loss": 0.034423828125, + "ewc_loss_parallel": 3.4332275390625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40936851501465, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8561975955963135, + "num_tokens": 534011913.0, + "step": 14001 + }, + { + "epoch": 1.7811983208243225, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.248342514038086, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8800193071365356, + "num_tokens": 534057235.0, + "step": 14002 + }, + { + "epoch": 1.781325531102913, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.47573471069336, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.86915123462677, + "num_tokens": 534092850.0, + "step": 14003 + }, + { + "epoch": 1.7814527413815036, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.740209579467773, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.873690128326416, + "num_tokens": 534135310.0, + "step": 14004 + }, + { + "epoch": 1.781579951660094, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.09552001953125, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8667659759521484, + "num_tokens": 534176008.0, + "step": 14005 + }, + { + "epoch": 1.7817071619386846, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.380701065063477, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8693287372589111, + "num_tokens": 534208217.0, + "step": 14006 + }, + { + "epoch": 1.7818343722172751, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.463579177856445, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.873264491558075, + "num_tokens": 534247564.0, + "step": 14007 + }, + { + "epoch": 1.7819615824958657, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.361618041992188, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.851136326789856, + "num_tokens": 534281144.0, + "step": 14008 + }, + { + "epoch": 1.7820887927744562, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4759464263916, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8586559295654297, + "num_tokens": 534317893.0, + "step": 14009 + }, + { + "epoch": 1.7822160030530467, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.404563903808594, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8669284582138062, + "num_tokens": 534356116.0, + "step": 14010 + }, + { + "epoch": 1.7823432133316373, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.383026123046875, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8436676859855652, + "num_tokens": 534390499.0, + "step": 14011 + }, + { + "epoch": 1.7824704236102278, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.437889099121094, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.864172637462616, + "num_tokens": 534424242.0, + "step": 14012 + }, + { + "epoch": 1.7825976338888183, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.104042053222656, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8601707220077515, + "num_tokens": 534467465.0, + "step": 14013 + }, + { + "epoch": 1.7827248441674088, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51296043395996, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8630715012550354, + "num_tokens": 534502959.0, + "step": 14014 + }, + { + "epoch": 1.7828520544459994, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.42156219482422, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8699839115142822, + "num_tokens": 534549888.0, + "step": 14015 + }, + { + "epoch": 1.78297926472459, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.379133224487305, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8704103827476501, + "num_tokens": 534591686.0, + "step": 14016 + }, + { + "epoch": 1.7831064750031802, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.459882736206055, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8852077126502991, + "num_tokens": 534628584.0, + "step": 14017 + }, + { + "epoch": 1.7832336852817707, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.413190841674805, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8586060404777527, + "num_tokens": 534665684.0, + "step": 14018 + }, + { + "epoch": 1.7833608955603613, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.213951110839844, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8710832595825195, + "num_tokens": 534701604.0, + "step": 14019 + }, + { + "epoch": 1.7834881058389518, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34383201599121, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.853537380695343, + "num_tokens": 534741765.0, + "step": 14020 + }, + { + "epoch": 1.7836153161175423, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40827178955078, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8776724338531494, + "num_tokens": 534780331.0, + "step": 14021 + }, + { + "epoch": 1.7837425263961328, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.283977508544922, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8567903637886047, + "num_tokens": 534816254.0, + "step": 14022 + }, + { + "epoch": 1.7838697366747234, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.336368560791016, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8678668737411499, + "num_tokens": 534851367.0, + "step": 14023 + }, + { + "epoch": 1.7839969469533137, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43746566772461, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8652912974357605, + "num_tokens": 534889077.0, + "step": 14024 + }, + { + "epoch": 1.7841241572319042, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.220975875854492, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8574317097663879, + "num_tokens": 534924249.0, + "step": 14025 + }, + { + "epoch": 1.7842513675104947, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3447265625, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8530118465423584, + "num_tokens": 534966370.0, + "step": 14026 + }, + { + "epoch": 1.7843785777890853, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.223283767700195, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8750827312469482, + "num_tokens": 535005869.0, + "step": 14027 + }, + { + "epoch": 1.7845057880676758, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.300880432128906, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8648093938827515, + "num_tokens": 535042057.0, + "step": 14028 + }, + { + "epoch": 1.7846329983462663, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.350242614746094, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8632129430770874, + "num_tokens": 535074410.0, + "step": 14029 + }, + { + "epoch": 1.7847602086248568, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.271211624145508, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8595020771026611, + "num_tokens": 535120356.0, + "step": 14030 + }, + { + "epoch": 1.7848874189034474, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.210803985595703, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8594954013824463, + "num_tokens": 535156939.0, + "step": 14031 + }, + { + "epoch": 1.785014629182038, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.375288009643555, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8705583214759827, + "num_tokens": 535195553.0, + "step": 14032 + }, + { + "epoch": 1.7851418394606284, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.355327606201172, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8726641535758972, + "num_tokens": 535235692.0, + "step": 14033 + }, + { + "epoch": 1.785269049739219, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40421485900879, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8670993447303772, + "num_tokens": 535269265.0, + "step": 14034 + }, + { + "epoch": 1.7853962600178095, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39011001586914, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8835692405700684, + "num_tokens": 535303825.0, + "step": 14035 + }, + { + "epoch": 1.7855234702964, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.296527862548828, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8623644113540649, + "num_tokens": 535338802.0, + "step": 14036 + }, + { + "epoch": 1.7856506805749905, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35574722290039, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8755276203155518, + "num_tokens": 535376592.0, + "step": 14037 + }, + { + "epoch": 1.785777890853581, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.269433975219727, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8645062446594238, + "num_tokens": 535416528.0, + "step": 14038 + }, + { + "epoch": 1.7859051011321716, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.181747436523438, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8640831112861633, + "num_tokens": 535450892.0, + "step": 14039 + }, + { + "epoch": 1.7860323114107621, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.106653213500977, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.870343804359436, + "num_tokens": 535488198.0, + "step": 14040 + }, + { + "epoch": 1.7861595216893527, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.46527099609375, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8670504689216614, + "num_tokens": 535526020.0, + "step": 14041 + }, + { + "epoch": 1.786286731967943, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.289329528808594, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8732534646987915, + "num_tokens": 535560400.0, + "step": 14042 + }, + { + "epoch": 1.7864139422465335, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.369953155517578, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.86187744140625, + "num_tokens": 535594699.0, + "step": 14043 + }, + { + "epoch": 1.786541152525124, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.248048782348633, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8738440275192261, + "num_tokens": 535627019.0, + "step": 14044 + }, + { + "epoch": 1.7866683628037145, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15380096435547, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8778098821640015, + "num_tokens": 535667479.0, + "step": 14045 + }, + { + "epoch": 1.786795573082305, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48430061340332, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8627074956893921, + "num_tokens": 535705211.0, + "step": 14046 + }, + { + "epoch": 1.7869227833608956, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.147340774536133, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8484708666801453, + "num_tokens": 535744907.0, + "step": 14047 + }, + { + "epoch": 1.787049993639486, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.476594924926758, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8670926690101624, + "num_tokens": 535782526.0, + "step": 14048 + }, + { + "epoch": 1.7871772039180764, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.356239318847656, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8840150833129883, + "num_tokens": 535824153.0, + "step": 14049 + }, + { + "epoch": 1.787304414196667, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.163713455200195, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8795951008796692, + "num_tokens": 535859019.0, + "step": 14050 + }, + { + "epoch": 1.7874316244752575, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44414710998535, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8800293803215027, + "num_tokens": 535896866.0, + "step": 14051 + }, + { + "epoch": 1.787558834753848, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34484100341797, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8696032762527466, + "num_tokens": 535937128.0, + "step": 14052 + }, + { + "epoch": 1.7876860450324386, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.306774139404297, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.883667528629303, + "num_tokens": 535972727.0, + "step": 14053 + }, + { + "epoch": 1.787813255311029, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.469247817993164, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8737220764160156, + "num_tokens": 536011491.0, + "step": 14054 + }, + { + "epoch": 1.7879404655896196, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.341520309448242, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8722937107086182, + "num_tokens": 536048074.0, + "step": 14055 + }, + { + "epoch": 1.7880676758682101, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.408748626708984, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8756529092788696, + "num_tokens": 536089554.0, + "step": 14056 + }, + { + "epoch": 1.7881948861468007, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.340667724609375, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8773229122161865, + "num_tokens": 536127672.0, + "step": 14057 + }, + { + "epoch": 1.7883220964253912, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.352813720703125, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8695323467254639, + "num_tokens": 536157517.0, + "step": 14058 + }, + { + "epoch": 1.7884493067039817, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.250911712646484, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8809469938278198, + "num_tokens": 536192855.0, + "step": 14059 + }, + { + "epoch": 1.7885765169825723, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3435001373291, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8706095218658447, + "num_tokens": 536231727.0, + "step": 14060 + }, + { + "epoch": 1.7887037272611628, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.352458953857422, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8816725611686707, + "num_tokens": 536270738.0, + "step": 14061 + }, + { + "epoch": 1.7888309375397533, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.394142150878906, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8871421813964844, + "num_tokens": 536313006.0, + "step": 14062 + }, + { + "epoch": 1.7889581478183438, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.357824325561523, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8624536395072937, + "num_tokens": 536350697.0, + "step": 14063 + }, + { + "epoch": 1.7890853580969344, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.47838592529297, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8775570392608643, + "num_tokens": 536394252.0, + "step": 14064 + }, + { + "epoch": 1.789212568375525, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.320354461669922, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8673855662345886, + "num_tokens": 536426591.0, + "step": 14065 + }, + { + "epoch": 1.7893397786541152, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.584440231323242, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8569807410240173, + "num_tokens": 536468567.0, + "step": 14066 + }, + { + "epoch": 1.7894669889327057, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.528100967407227, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8600132465362549, + "num_tokens": 536506352.0, + "step": 14067 + }, + { + "epoch": 1.7895941992112963, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50248908996582, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8628043532371521, + "num_tokens": 536538122.0, + "step": 14068 + }, + { + "epoch": 1.7897214094898868, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.711877822875977, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8592953681945801, + "num_tokens": 536574617.0, + "step": 14069 + }, + { + "epoch": 1.7898486197684773, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.216779708862305, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8805382251739502, + "num_tokens": 536611895.0, + "step": 14070 + }, + { + "epoch": 1.7899758300470678, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36768913269043, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8617922067642212, + "num_tokens": 536651336.0, + "step": 14071 + }, + { + "epoch": 1.7901030403256584, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.708667755126953, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8651536107063293, + "num_tokens": 536692566.0, + "step": 14072 + }, + { + "epoch": 1.7902302506042487, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.682374954223633, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8746247291564941, + "num_tokens": 536730855.0, + "step": 14073 + }, + { + "epoch": 1.7903574608828392, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.351022720336914, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8672341108322144, + "num_tokens": 536766478.0, + "step": 14074 + }, + { + "epoch": 1.7904846711614297, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.493816375732422, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8643465638160706, + "num_tokens": 536799900.0, + "step": 14075 + }, + { + "epoch": 1.7906118814400203, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.269372940063477, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8759841918945312, + "num_tokens": 536843597.0, + "step": 14076 + }, + { + "epoch": 1.7907390917186108, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.200796127319336, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8798837065696716, + "num_tokens": 536879189.0, + "step": 14077 + }, + { + "epoch": 1.7908663019972013, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.319385528564453, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8730910420417786, + "num_tokens": 536918099.0, + "step": 14078 + }, + { + "epoch": 1.7909935122757918, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4060001373291, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8598722815513611, + "num_tokens": 536954410.0, + "step": 14079 + }, + { + "epoch": 1.7911207225543824, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.254261016845703, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8907282948493958, + "num_tokens": 536986688.0, + "step": 14080 + }, + { + "epoch": 1.791247932832973, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.306102752685547, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.869917631149292, + "num_tokens": 537024181.0, + "step": 14081 + }, + { + "epoch": 1.7913751431115634, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33119010925293, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8729425668716431, + "num_tokens": 537064174.0, + "step": 14082 + }, + { + "epoch": 1.791502353390154, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.196191787719727, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8628948926925659, + "num_tokens": 537100279.0, + "step": 14083 + }, + { + "epoch": 1.7916295636687445, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.25832748413086, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.844354510307312, + "num_tokens": 537135339.0, + "step": 14084 + }, + { + "epoch": 1.791756773947335, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.283626556396484, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8638814091682434, + "num_tokens": 537169608.0, + "step": 14085 + }, + { + "epoch": 1.7918839842259255, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40192222595215, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8767673969268799, + "num_tokens": 537205560.0, + "step": 14086 + }, + { + "epoch": 1.792011194504516, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.426895141601562, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8578293323516846, + "num_tokens": 537238697.0, + "step": 14087 + }, + { + "epoch": 1.7921384047831066, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.18038558959961, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8644245862960815, + "num_tokens": 537282365.0, + "step": 14088 + }, + { + "epoch": 1.7922656150616971, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.461233139038086, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8776916265487671, + "num_tokens": 537320805.0, + "step": 14089 + }, + { + "epoch": 1.7923928253402877, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33787727355957, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8719948530197144, + "num_tokens": 537354607.0, + "step": 14090 + }, + { + "epoch": 1.792520035618878, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.307477951049805, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8787364959716797, + "num_tokens": 537386672.0, + "step": 14091 + }, + { + "epoch": 1.7926472458974685, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.346145629882812, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8716073036193848, + "num_tokens": 537430882.0, + "step": 14092 + }, + { + "epoch": 1.792774456176059, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.575563430786133, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8721450567245483, + "num_tokens": 537465114.0, + "step": 14093 + }, + { + "epoch": 1.7929016664546495, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.386457443237305, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8638957738876343, + "num_tokens": 537502820.0, + "step": 14094 + }, + { + "epoch": 1.79302887673324, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.273513793945312, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8780760765075684, + "num_tokens": 537537422.0, + "step": 14095 + }, + { + "epoch": 1.7931560870118306, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62249183654785, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8732902407646179, + "num_tokens": 537566828.0, + "step": 14096 + }, + { + "epoch": 1.793283297290421, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.304018020629883, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8671688437461853, + "num_tokens": 537606102.0, + "step": 14097 + }, + { + "epoch": 1.7934105075690114, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.468175888061523, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8762864470481873, + "num_tokens": 537638527.0, + "step": 14098 + }, + { + "epoch": 1.793537717847602, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4293270111084, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8687818050384521, + "num_tokens": 537683375.0, + "step": 14099 + }, + { + "epoch": 1.7936649281261925, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.336410522460938, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8644486665725708, + "num_tokens": 537731354.0, + "step": 14100 + }, + { + "epoch": 1.793792138404783, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.463441848754883, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8646830320358276, + "num_tokens": 537770055.0, + "step": 14101 + }, + { + "epoch": 1.7939193486833735, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67241096496582, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8491122722625732, + "num_tokens": 537807977.0, + "step": 14102 + }, + { + "epoch": 1.794046558961964, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.406288146972656, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8640522956848145, + "num_tokens": 537847809.0, + "step": 14103 + }, + { + "epoch": 1.7941737692405546, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3386173248291, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8502029180526733, + "num_tokens": 537885506.0, + "step": 14104 + }, + { + "epoch": 1.7943009795191451, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.480854034423828, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8643879890441895, + "num_tokens": 537918936.0, + "step": 14105 + }, + { + "epoch": 1.7944281897977357, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.576658248901367, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8735644817352295, + "num_tokens": 537954205.0, + "step": 14106 + }, + { + "epoch": 1.7945554000763262, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35416030883789, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8712936043739319, + "num_tokens": 537989190.0, + "step": 14107 + }, + { + "epoch": 1.7946826103549167, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.786409378051758, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8595452308654785, + "num_tokens": 538028852.0, + "step": 14108 + }, + { + "epoch": 1.7948098206335072, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.735536575317383, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8747478723526001, + "num_tokens": 538061245.0, + "step": 14109 + }, + { + "epoch": 1.7949370309120978, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.25152587890625, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8800805807113647, + "num_tokens": 538103238.0, + "step": 14110 + }, + { + "epoch": 1.7950642411906883, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.20233726501465, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8543527722358704, + "num_tokens": 538142692.0, + "step": 14111 + }, + { + "epoch": 1.7951914514692788, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50322914123535, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8666422367095947, + "num_tokens": 538183155.0, + "step": 14112 + }, + { + "epoch": 1.7953186617478694, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39148712158203, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8673850893974304, + "num_tokens": 538222063.0, + "step": 14113 + }, + { + "epoch": 1.7954458720264599, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49052619934082, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8713477849960327, + "num_tokens": 538259847.0, + "step": 14114 + }, + { + "epoch": 1.7955730823050502, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.504905700683594, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8759752511978149, + "num_tokens": 538298557.0, + "step": 14115 + }, + { + "epoch": 1.7957002925836407, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.423290252685547, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8667388558387756, + "num_tokens": 538337926.0, + "step": 14116 + }, + { + "epoch": 1.7958275028622313, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56507110595703, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.872810423374176, + "num_tokens": 538382755.0, + "step": 14117 + }, + { + "epoch": 1.7959547131408218, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.675512313842773, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8509408831596375, + "num_tokens": 538417639.0, + "step": 14118 + }, + { + "epoch": 1.7960819234194123, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.431859970092773, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8629567623138428, + "num_tokens": 538458444.0, + "step": 14119 + }, + { + "epoch": 1.7962091336980028, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5438232421875, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.869721531867981, + "num_tokens": 538496580.0, + "step": 14120 + }, + { + "epoch": 1.7963363439765934, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.648435592651367, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.877821147441864, + "num_tokens": 538538474.0, + "step": 14121 + }, + { + "epoch": 1.7964635542551837, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.549190521240234, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8713064193725586, + "num_tokens": 538576828.0, + "step": 14122 + }, + { + "epoch": 1.7965907645337742, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43589210510254, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8775957822799683, + "num_tokens": 538611630.0, + "step": 14123 + }, + { + "epoch": 1.7967179748123647, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.752704620361328, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8455300331115723, + "num_tokens": 538659537.0, + "step": 14124 + }, + { + "epoch": 1.7968451850909553, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.462690353393555, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8762599229812622, + "num_tokens": 538700216.0, + "step": 14125 + }, + { + "epoch": 1.7969723953695458, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.427289962768555, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.866155207157135, + "num_tokens": 538737719.0, + "step": 14126 + }, + { + "epoch": 1.7970996056481363, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.595388412475586, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8769513368606567, + "num_tokens": 538774723.0, + "step": 14127 + }, + { + "epoch": 1.7972268159267268, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.550453186035156, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8784404993057251, + "num_tokens": 538815288.0, + "step": 14128 + }, + { + "epoch": 1.7973540262053174, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.300193786621094, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.868036687374115, + "num_tokens": 538858239.0, + "step": 14129 + }, + { + "epoch": 1.797481236483908, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.415613174438477, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8717940449714661, + "num_tokens": 538896947.0, + "step": 14130 + }, + { + "epoch": 1.7976084467624984, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.817649841308594, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8667607307434082, + "num_tokens": 538936511.0, + "step": 14131 + }, + { + "epoch": 1.797735657041089, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.507902145385742, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8635648488998413, + "num_tokens": 538969198.0, + "step": 14132 + }, + { + "epoch": 1.7978628673196795, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40899658203125, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8796499967575073, + "num_tokens": 539002258.0, + "step": 14133 + }, + { + "epoch": 1.79799007759827, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48995018005371, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8556460738182068, + "num_tokens": 539039993.0, + "step": 14134 + }, + { + "epoch": 1.7981172878768605, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4027042388916, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8649355173110962, + "num_tokens": 539080713.0, + "step": 14135 + }, + { + "epoch": 1.798244498155451, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64595603942871, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.878343939781189, + "num_tokens": 539121115.0, + "step": 14136 + }, + { + "epoch": 1.7983717084340416, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.321414947509766, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8872486352920532, + "num_tokens": 539159889.0, + "step": 14137 + }, + { + "epoch": 1.7984989187126321, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49275779724121, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8681166768074036, + "num_tokens": 539201508.0, + "step": 14138 + }, + { + "epoch": 1.7986261289912227, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3635196685791, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8510557413101196, + "num_tokens": 539238640.0, + "step": 14139 + }, + { + "epoch": 1.798753339269813, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.248828887939453, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8646531701087952, + "num_tokens": 539274797.0, + "step": 14140 + }, + { + "epoch": 1.7988805495484035, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.300329208374023, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8580355048179626, + "num_tokens": 539308410.0, + "step": 14141 + }, + { + "epoch": 1.799007759826994, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.340715408325195, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8606128692626953, + "num_tokens": 539345578.0, + "step": 14142 + }, + { + "epoch": 1.7991349701055845, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.399293899536133, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8534817695617676, + "num_tokens": 539378946.0, + "step": 14143 + }, + { + "epoch": 1.799262180384175, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.367950439453125, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8801391124725342, + "num_tokens": 539417663.0, + "step": 14144 + }, + { + "epoch": 1.7993893906627656, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.286836624145508, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8837498426437378, + "num_tokens": 539459597.0, + "step": 14145 + }, + { + "epoch": 1.799516600941356, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44363784790039, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8716527223587036, + "num_tokens": 539499579.0, + "step": 14146 + }, + { + "epoch": 1.7996438112199464, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4095401763916, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.877711296081543, + "num_tokens": 539538505.0, + "step": 14147 + }, + { + "epoch": 1.799771021498537, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.366867065429688, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8622239828109741, + "num_tokens": 539574824.0, + "step": 14148 + }, + { + "epoch": 1.7998982317771275, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.364118576049805, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8556154370307922, + "num_tokens": 539614181.0, + "step": 14149 + }, + { + "epoch": 1.800025442055718, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40957260131836, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8573383092880249, + "num_tokens": 539656713.0, + "step": 14150 + }, + { + "epoch": 1.8001526523343085, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.307846069335938, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8674864768981934, + "num_tokens": 539695436.0, + "step": 14151 + }, + { + "epoch": 1.800279862612899, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.536909103393555, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8704198002815247, + "num_tokens": 539736549.0, + "step": 14152 + }, + { + "epoch": 1.8004070728914896, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31623649597168, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8742458820343018, + "num_tokens": 539776326.0, + "step": 14153 + }, + { + "epoch": 1.8005342831700801, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.402238845825195, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8760197162628174, + "num_tokens": 539819665.0, + "step": 14154 + }, + { + "epoch": 1.8006614934486707, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.556737899780273, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8668291568756104, + "num_tokens": 539858175.0, + "step": 14155 + }, + { + "epoch": 1.8007887037272612, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49025535583496, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8675606846809387, + "num_tokens": 539895152.0, + "step": 14156 + }, + { + "epoch": 1.8009159140058517, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.511627197265625, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8743815422058105, + "num_tokens": 539941033.0, + "step": 14157 + }, + { + "epoch": 1.8010431242844422, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.15865135192871, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8703298568725586, + "num_tokens": 539977959.0, + "step": 14158 + }, + { + "epoch": 1.8011703345630328, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.473230361938477, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8676600456237793, + "num_tokens": 540014857.0, + "step": 14159 + }, + { + "epoch": 1.8012975448416233, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29599952697754, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8669169545173645, + "num_tokens": 540055775.0, + "step": 14160 + }, + { + "epoch": 1.8014247551202138, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28532600402832, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8779369592666626, + "num_tokens": 540090287.0, + "step": 14161 + }, + { + "epoch": 1.8015519653988044, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53207015991211, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8777390718460083, + "num_tokens": 540129139.0, + "step": 14162 + }, + { + "epoch": 1.8016791756773949, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.568260192871094, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8675967454910278, + "num_tokens": 540165187.0, + "step": 14163 + }, + { + "epoch": 1.8018063859559852, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37899398803711, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8808501958847046, + "num_tokens": 540201113.0, + "step": 14164 + }, + { + "epoch": 1.8019335962345757, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34721565246582, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8708696365356445, + "num_tokens": 540237448.0, + "step": 14165 + }, + { + "epoch": 1.8020608065131662, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40789794921875, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8632779717445374, + "num_tokens": 540279703.0, + "step": 14166 + }, + { + "epoch": 1.8021880167917568, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.508647918701172, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8512792587280273, + "num_tokens": 540313071.0, + "step": 14167 + }, + { + "epoch": 1.8023152270703473, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.523681640625, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.872381865978241, + "num_tokens": 540353940.0, + "step": 14168 + }, + { + "epoch": 1.8024424373489378, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32549476623535, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.866073727607727, + "num_tokens": 540391241.0, + "step": 14169 + }, + { + "epoch": 1.8025696476275284, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34688949584961, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8794170022010803, + "num_tokens": 540420205.0, + "step": 14170 + }, + { + "epoch": 1.8026968579061187, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49649429321289, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8733044862747192, + "num_tokens": 540452121.0, + "step": 14171 + }, + { + "epoch": 1.8028240681847092, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.336292266845703, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8692512512207031, + "num_tokens": 540487415.0, + "step": 14172 + }, + { + "epoch": 1.8029512784632997, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.342697143554688, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8580235242843628, + "num_tokens": 540522293.0, + "step": 14173 + }, + { + "epoch": 1.8030784887418903, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30733871459961, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8801236152648926, + "num_tokens": 540560345.0, + "step": 14174 + }, + { + "epoch": 1.8032056990204808, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.478240966796875, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8771185874938965, + "num_tokens": 540598299.0, + "step": 14175 + }, + { + "epoch": 1.8033329092990713, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35698890686035, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8802769780158997, + "num_tokens": 540634480.0, + "step": 14176 + }, + { + "epoch": 1.8034601195776618, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.427223205566406, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.860703706741333, + "num_tokens": 540679473.0, + "step": 14177 + }, + { + "epoch": 1.8035873298562524, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30759620666504, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8629043102264404, + "num_tokens": 540722259.0, + "step": 14178 + }, + { + "epoch": 1.803714540134843, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.500757217407227, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8717696070671082, + "num_tokens": 540758058.0, + "step": 14179 + }, + { + "epoch": 1.8038417504134334, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.439104080200195, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8642481565475464, + "num_tokens": 540798535.0, + "step": 14180 + }, + { + "epoch": 1.803968960692024, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.298477172851562, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8696454763412476, + "num_tokens": 540839423.0, + "step": 14181 + }, + { + "epoch": 1.8040961709706145, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.362136840820312, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8751202821731567, + "num_tokens": 540887512.0, + "step": 14182 + }, + { + "epoch": 1.804223381249205, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.482683181762695, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8711235523223877, + "num_tokens": 540922292.0, + "step": 14183 + }, + { + "epoch": 1.8043505915277955, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.441152572631836, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8752650022506714, + "num_tokens": 540964105.0, + "step": 14184 + }, + { + "epoch": 1.804477801806386, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.445524215698242, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8678865432739258, + "num_tokens": 541001793.0, + "step": 14185 + }, + { + "epoch": 1.8046050120849766, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.31550407409668, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8756012320518494, + "num_tokens": 541041104.0, + "step": 14186 + }, + { + "epoch": 1.8047322223635671, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32833480834961, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8628066182136536, + "num_tokens": 541079643.0, + "step": 14187 + }, + { + "epoch": 1.8048594326421576, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.429088592529297, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8738673329353333, + "num_tokens": 541118554.0, + "step": 14188 + }, + { + "epoch": 1.804986642920748, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.489315032958984, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8580797910690308, + "num_tokens": 541156893.0, + "step": 14189 + }, + { + "epoch": 1.8051138531993385, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2782039642334, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8848156332969666, + "num_tokens": 541197283.0, + "step": 14190 + }, + { + "epoch": 1.805241063477929, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.331825256347656, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8822054862976074, + "num_tokens": 541232343.0, + "step": 14191 + }, + { + "epoch": 1.8053682737565195, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.355806350708008, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8615196943283081, + "num_tokens": 541270274.0, + "step": 14192 + }, + { + "epoch": 1.80549548403511, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.477327346801758, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8644958734512329, + "num_tokens": 541306018.0, + "step": 14193 + }, + { + "epoch": 1.8056226943137006, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43222999572754, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8704217672348022, + "num_tokens": 541344974.0, + "step": 14194 + }, + { + "epoch": 1.805749904592291, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.401700973510742, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8786015510559082, + "num_tokens": 541378590.0, + "step": 14195 + }, + { + "epoch": 1.8058771148708814, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33975601196289, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8720143437385559, + "num_tokens": 541415945.0, + "step": 14196 + }, + { + "epoch": 1.806004325149472, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.378084182739258, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8658949732780457, + "num_tokens": 541459577.0, + "step": 14197 + }, + { + "epoch": 1.8061315354280625, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.393722534179688, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8705737590789795, + "num_tokens": 541501041.0, + "step": 14198 + }, + { + "epoch": 1.806258745706653, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.274662017822266, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8620468378067017, + "num_tokens": 541538587.0, + "step": 14199 + }, + { + "epoch": 1.8063859559852435, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.301427841186523, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.857157289981842, + "num_tokens": 541582225.0, + "step": 14200 + }, + { + "epoch": 1.806513166263834, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.484689712524414, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8795161247253418, + "num_tokens": 541616956.0, + "step": 14201 + }, + { + "epoch": 1.8066403765424246, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.280303955078125, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8578012585639954, + "num_tokens": 541650814.0, + "step": 14202 + }, + { + "epoch": 1.8067675868210151, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.636688232421875, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8788255453109741, + "num_tokens": 541692167.0, + "step": 14203 + }, + { + "epoch": 1.8068947970996057, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.279903411865234, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8601198792457581, + "num_tokens": 541732427.0, + "step": 14204 + }, + { + "epoch": 1.8070220073781962, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.570852279663086, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8694831728935242, + "num_tokens": 541773256.0, + "step": 14205 + }, + { + "epoch": 1.8071492176567867, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.397226333618164, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8752808570861816, + "num_tokens": 541815008.0, + "step": 14206 + }, + { + "epoch": 1.8072764279353772, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.376848220825195, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8725908994674683, + "num_tokens": 541853462.0, + "step": 14207 + }, + { + "epoch": 1.8074036382139678, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.468671798706055, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8640652298927307, + "num_tokens": 541890487.0, + "step": 14208 + }, + { + "epoch": 1.8075308484925583, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.327295303344727, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8735415935516357, + "num_tokens": 541929857.0, + "step": 14209 + }, + { + "epoch": 1.8076580587711488, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44361114501953, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8630109429359436, + "num_tokens": 541969923.0, + "step": 14210 + }, + { + "epoch": 1.8077852690497394, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.318828582763672, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8773278594017029, + "num_tokens": 542008802.0, + "step": 14211 + }, + { + "epoch": 1.8079124793283299, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3303279876709, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8827969431877136, + "num_tokens": 542049276.0, + "step": 14212 + }, + { + "epoch": 1.8080396896069202, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.362045288085938, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8630684018135071, + "num_tokens": 542086921.0, + "step": 14213 + }, + { + "epoch": 1.8081668998855107, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.499807357788086, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8695908784866333, + "num_tokens": 542124291.0, + "step": 14214 + }, + { + "epoch": 1.8082941101641012, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.267078399658203, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8586537837982178, + "num_tokens": 542167600.0, + "step": 14215 + }, + { + "epoch": 1.8084213204426918, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.339683532714844, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8650041818618774, + "num_tokens": 542201134.0, + "step": 14216 + }, + { + "epoch": 1.8085485307212823, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.381242752075195, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8558017611503601, + "num_tokens": 542242590.0, + "step": 14217 + }, + { + "epoch": 1.8086757409998728, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.332866668701172, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8694078922271729, + "num_tokens": 542281945.0, + "step": 14218 + }, + { + "epoch": 1.8088029512784631, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.510000228881836, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8776434659957886, + "num_tokens": 542319389.0, + "step": 14219 + }, + { + "epoch": 1.8089301615570537, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26470947265625, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8571196794509888, + "num_tokens": 542356912.0, + "step": 14220 + }, + { + "epoch": 1.8090573718356442, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.430049896240234, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8632787466049194, + "num_tokens": 542394054.0, + "step": 14221 + }, + { + "epoch": 1.8091845821142347, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.258817672729492, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8736028671264648, + "num_tokens": 542431063.0, + "step": 14222 + }, + { + "epoch": 1.8093117923928252, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58107566833496, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8573328256607056, + "num_tokens": 542471398.0, + "step": 14223 + }, + { + "epoch": 1.8094390026714158, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.343534469604492, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8552947044372559, + "num_tokens": 542516288.0, + "step": 14224 + }, + { + "epoch": 1.8095662129500063, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.283010482788086, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8673198819160461, + "num_tokens": 542552292.0, + "step": 14225 + }, + { + "epoch": 1.8096934232285968, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41098403930664, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8710483908653259, + "num_tokens": 542592380.0, + "step": 14226 + }, + { + "epoch": 1.8098206335071874, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.351823806762695, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8612801432609558, + "num_tokens": 542632698.0, + "step": 14227 + }, + { + "epoch": 1.8099478437857779, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.421510696411133, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8585238456726074, + "num_tokens": 542668234.0, + "step": 14228 + }, + { + "epoch": 1.8100750540643684, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34360694885254, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.867502748966217, + "num_tokens": 542706205.0, + "step": 14229 + }, + { + "epoch": 1.810202264342959, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.393482208251953, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8671087026596069, + "num_tokens": 542737068.0, + "step": 14230 + }, + { + "epoch": 1.8103294746215495, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.370065689086914, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.880774974822998, + "num_tokens": 542769776.0, + "step": 14231 + }, + { + "epoch": 1.81045668490014, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49264144897461, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8809307813644409, + "num_tokens": 542802440.0, + "step": 14232 + }, + { + "epoch": 1.8105838951787305, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37896156311035, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8815884590148926, + "num_tokens": 542841158.0, + "step": 14233 + }, + { + "epoch": 1.810711105457321, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.492389678955078, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8649296164512634, + "num_tokens": 542876637.0, + "step": 14234 + }, + { + "epoch": 1.8108383157359116, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32796287536621, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8754698038101196, + "num_tokens": 542915087.0, + "step": 14235 + }, + { + "epoch": 1.8109655260145021, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.457103729248047, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8606160879135132, + "num_tokens": 542950635.0, + "step": 14236 + }, + { + "epoch": 1.8110927362930926, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.555320739746094, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8641332387924194, + "num_tokens": 542990078.0, + "step": 14237 + }, + { + "epoch": 1.811219946571683, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.254894256591797, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8438490033149719, + "num_tokens": 543030333.0, + "step": 14238 + }, + { + "epoch": 1.8113471568502735, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.341176986694336, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8620771169662476, + "num_tokens": 543070537.0, + "step": 14239 + }, + { + "epoch": 1.811474367128864, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40325355529785, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8667823672294617, + "num_tokens": 543109137.0, + "step": 14240 + }, + { + "epoch": 1.8116015774074545, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32149887084961, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8629437685012817, + "num_tokens": 543146616.0, + "step": 14241 + }, + { + "epoch": 1.811728787686045, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.292484283447266, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8823946714401245, + "num_tokens": 543178433.0, + "step": 14242 + }, + { + "epoch": 1.8118559979646356, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.28844451904297, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8805108070373535, + "num_tokens": 543215086.0, + "step": 14243 + }, + { + "epoch": 1.811983208243226, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.24036979675293, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8535381555557251, + "num_tokens": 543252111.0, + "step": 14244 + }, + { + "epoch": 1.8121104185218164, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.465404510498047, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8761425018310547, + "num_tokens": 543291938.0, + "step": 14245 + }, + { + "epoch": 1.812237628800407, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.27316665649414, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.873042106628418, + "num_tokens": 543335141.0, + "step": 14246 + }, + { + "epoch": 1.8123648390789975, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.430051803588867, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8689598441123962, + "num_tokens": 543371305.0, + "step": 14247 + }, + { + "epoch": 1.812492049357588, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.310077667236328, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8792415857315063, + "num_tokens": 543407431.0, + "step": 14248 + }, + { + "epoch": 1.8126192596361785, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.170982360839844, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8690004944801331, + "num_tokens": 543450287.0, + "step": 14249 + }, + { + "epoch": 1.812746469914769, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53792381286621, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8811418414115906, + "num_tokens": 543486521.0, + "step": 14250 + }, + { + "epoch": 1.8128736801933596, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37489891052246, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8685458302497864, + "num_tokens": 543522157.0, + "step": 14251 + }, + { + "epoch": 1.8130008904719501, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.24509048461914, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8720463514328003, + "num_tokens": 543560503.0, + "step": 14252 + }, + { + "epoch": 1.8131281007505406, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.347978591918945, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8575600385665894, + "num_tokens": 543598943.0, + "step": 14253 + }, + { + "epoch": 1.8132553110291312, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.2618408203125, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8714004755020142, + "num_tokens": 543640466.0, + "step": 14254 + }, + { + "epoch": 1.8133825213077217, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.363201141357422, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8707355260848999, + "num_tokens": 543677581.0, + "step": 14255 + }, + { + "epoch": 1.8135097315863122, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.484376907348633, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8619638681411743, + "num_tokens": 543711220.0, + "step": 14256 + }, + { + "epoch": 1.8136369418649028, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4162654876709, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.862987220287323, + "num_tokens": 543750109.0, + "step": 14257 + }, + { + "epoch": 1.8137641521434933, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.34554100036621, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8737943172454834, + "num_tokens": 543788040.0, + "step": 14258 + }, + { + "epoch": 1.8138913624220838, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.511510848999023, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8689368367195129, + "num_tokens": 543821037.0, + "step": 14259 + }, + { + "epoch": 1.8140185727006743, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.421764373779297, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.882064938545227, + "num_tokens": 543856331.0, + "step": 14260 + }, + { + "epoch": 1.8141457829792649, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.399709701538086, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8737429976463318, + "num_tokens": 543892898.0, + "step": 14261 + }, + { + "epoch": 1.8142729932578552, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.315261840820312, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8649947643280029, + "num_tokens": 543932858.0, + "step": 14262 + }, + { + "epoch": 1.8144002035364457, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39642906188965, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.877601146697998, + "num_tokens": 543968145.0, + "step": 14263 + }, + { + "epoch": 1.8145274138150362, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44745635986328, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8673794269561768, + "num_tokens": 544000495.0, + "step": 14264 + }, + { + "epoch": 1.8146546240936268, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32472038269043, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8641073107719421, + "num_tokens": 544041345.0, + "step": 14265 + }, + { + "epoch": 1.8147818343722173, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51041030883789, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8732445240020752, + "num_tokens": 544080084.0, + "step": 14266 + }, + { + "epoch": 1.8149090446508078, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41513442993164, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8606293797492981, + "num_tokens": 544115828.0, + "step": 14267 + }, + { + "epoch": 1.8150362549293981, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.537078857421875, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8661201000213623, + "num_tokens": 544153213.0, + "step": 14268 + }, + { + "epoch": 1.8151634652079887, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.447776794433594, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8725188970565796, + "num_tokens": 544192383.0, + "step": 14269 + }, + { + "epoch": 1.8152906754865792, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.661752700805664, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8711175918579102, + "num_tokens": 544232034.0, + "step": 14270 + }, + { + "epoch": 1.8154178857651697, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.52313995361328, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8540395498275757, + "num_tokens": 544266242.0, + "step": 14271 + }, + { + "epoch": 1.8155450960437602, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.515188217163086, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8587481379508972, + "num_tokens": 544304995.0, + "step": 14272 + }, + { + "epoch": 1.8156723063223508, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.422630310058594, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8681271076202393, + "num_tokens": 544341950.0, + "step": 14273 + }, + { + "epoch": 1.8157995166009413, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.454832077026367, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8656306266784668, + "num_tokens": 544379535.0, + "step": 14274 + }, + { + "epoch": 1.8159267268795318, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89629554748535, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8680769801139832, + "num_tokens": 544416939.0, + "step": 14275 + }, + { + "epoch": 1.8160539371581224, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.545841217041016, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8798328638076782, + "num_tokens": 544461476.0, + "step": 14276 + }, + { + "epoch": 1.8161811474367129, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74640464782715, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8692939281463623, + "num_tokens": 544505051.0, + "step": 14277 + }, + { + "epoch": 1.8163083577153034, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.452518463134766, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8697174787521362, + "num_tokens": 544550408.0, + "step": 14278 + }, + { + "epoch": 1.816435567993894, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.449344635009766, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8679805397987366, + "num_tokens": 544586645.0, + "step": 14279 + }, + { + "epoch": 1.8165627782724845, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59462547302246, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.861700177192688, + "num_tokens": 544628087.0, + "step": 14280 + }, + { + "epoch": 1.816689988551075, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.431640625, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8829939365386963, + "num_tokens": 544667961.0, + "step": 14281 + }, + { + "epoch": 1.8168171988296655, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53805160522461, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8652395009994507, + "num_tokens": 544704919.0, + "step": 14282 + }, + { + "epoch": 1.816944409108256, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48455810546875, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.874336302280426, + "num_tokens": 544741421.0, + "step": 14283 + }, + { + "epoch": 1.8170716193868466, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26590347290039, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8727356791496277, + "num_tokens": 544776414.0, + "step": 14284 + }, + { + "epoch": 1.817198829665437, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59226417541504, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8609917759895325, + "num_tokens": 544814964.0, + "step": 14285 + }, + { + "epoch": 1.8173260399440276, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.441913604736328, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8542888164520264, + "num_tokens": 544853381.0, + "step": 14286 + }, + { + "epoch": 1.817453250222618, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.576251983642578, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8704732656478882, + "num_tokens": 544892474.0, + "step": 14287 + }, + { + "epoch": 1.8175804605012085, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.410009384155273, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8759708404541016, + "num_tokens": 544927667.0, + "step": 14288 + }, + { + "epoch": 1.817707670779799, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.543237686157227, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8741889595985413, + "num_tokens": 544959780.0, + "step": 14289 + }, + { + "epoch": 1.8178348810583895, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3917293548584, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8618248701095581, + "num_tokens": 545001832.0, + "step": 14290 + }, + { + "epoch": 1.81796209133698, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4946231842041, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8769584894180298, + "num_tokens": 545038489.0, + "step": 14291 + }, + { + "epoch": 1.8180893016155706, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54767417907715, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8489782214164734, + "num_tokens": 545069086.0, + "step": 14292 + }, + { + "epoch": 1.818216511894161, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.465450286865234, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8716604709625244, + "num_tokens": 545104564.0, + "step": 14293 + }, + { + "epoch": 1.8183437221727514, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.408527374267578, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.873418927192688, + "num_tokens": 545149036.0, + "step": 14294 + }, + { + "epoch": 1.818470932451342, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.57944107055664, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8797990083694458, + "num_tokens": 545183825.0, + "step": 14295 + }, + { + "epoch": 1.8185981427299325, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.369258880615234, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8724308013916016, + "num_tokens": 545221274.0, + "step": 14296 + }, + { + "epoch": 1.818725353008523, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.634485244750977, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8763050436973572, + "num_tokens": 545259242.0, + "step": 14297 + }, + { + "epoch": 1.8188525632871135, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.502363204956055, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8561797142028809, + "num_tokens": 545289380.0, + "step": 14298 + }, + { + "epoch": 1.818979773565704, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76817512512207, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8718523383140564, + "num_tokens": 545321935.0, + "step": 14299 + }, + { + "epoch": 1.8191069838442946, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.403362274169922, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8766564130783081, + "num_tokens": 545365714.0, + "step": 14300 + }, + { + "epoch": 1.8192341941228851, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.392208099365234, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8589270114898682, + "num_tokens": 545406838.0, + "step": 14301 + }, + { + "epoch": 1.8193614044014756, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56251335144043, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8676603436470032, + "num_tokens": 545443265.0, + "step": 14302 + }, + { + "epoch": 1.8194886146800662, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.45832061767578, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8791566491127014, + "num_tokens": 545479903.0, + "step": 14303 + }, + { + "epoch": 1.8196158249586567, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40767478942871, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8759823441505432, + "num_tokens": 545513767.0, + "step": 14304 + }, + { + "epoch": 1.8197430352372472, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.446760177612305, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8543952703475952, + "num_tokens": 545550541.0, + "step": 14305 + }, + { + "epoch": 1.8198702455158378, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59859848022461, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8708072304725647, + "num_tokens": 545590739.0, + "step": 14306 + }, + { + "epoch": 1.8199974557944283, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.651836395263672, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8560680747032166, + "num_tokens": 545627127.0, + "step": 14307 + }, + { + "epoch": 1.8201246660730188, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.474109649658203, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8729923963546753, + "num_tokens": 545665473.0, + "step": 14308 + }, + { + "epoch": 1.8202518763516093, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.46872329711914, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8843862414360046, + "num_tokens": 545708018.0, + "step": 14309 + }, + { + "epoch": 1.8203790866301999, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.52601432800293, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8742274045944214, + "num_tokens": 545745635.0, + "step": 14310 + }, + { + "epoch": 1.8205062969087902, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.633472442626953, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8578900098800659, + "num_tokens": 545785260.0, + "step": 14311 + }, + { + "epoch": 1.8206335071873807, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.32756805419922, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8670052886009216, + "num_tokens": 545821696.0, + "step": 14312 + }, + { + "epoch": 1.8207607174659712, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.429943084716797, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8831177353858948, + "num_tokens": 545857971.0, + "step": 14313 + }, + { + "epoch": 1.8208879277445618, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.284442901611328, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8587743639945984, + "num_tokens": 545891000.0, + "step": 14314 + }, + { + "epoch": 1.8210151380231523, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.563316345214844, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8560885190963745, + "num_tokens": 545932825.0, + "step": 14315 + }, + { + "epoch": 1.8211423483017428, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.457794189453125, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8590275049209595, + "num_tokens": 545972849.0, + "step": 14316 + }, + { + "epoch": 1.8212695585803331, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40422821044922, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8810237646102905, + "num_tokens": 546011492.0, + "step": 14317 + }, + { + "epoch": 1.8213967688589237, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54764175415039, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8638559579849243, + "num_tokens": 546048461.0, + "step": 14318 + }, + { + "epoch": 1.8215239791375142, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49354362487793, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8778340816497803, + "num_tokens": 546087113.0, + "step": 14319 + }, + { + "epoch": 1.8216511894161047, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.256803512573242, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8721374869346619, + "num_tokens": 546124413.0, + "step": 14320 + }, + { + "epoch": 1.8217783996946952, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.489307403564453, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8804087042808533, + "num_tokens": 546165763.0, + "step": 14321 + }, + { + "epoch": 1.8219056099732858, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.478124618530273, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8701698780059814, + "num_tokens": 546203218.0, + "step": 14322 + }, + { + "epoch": 1.8220328202518763, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.456003189086914, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8590137362480164, + "num_tokens": 546247610.0, + "step": 14323 + }, + { + "epoch": 1.8221600305304668, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.349271774291992, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8608065247535706, + "num_tokens": 546288952.0, + "step": 14324 + }, + { + "epoch": 1.8222872408090574, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50008201599121, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8800904750823975, + "num_tokens": 546322449.0, + "step": 14325 + }, + { + "epoch": 1.8224144510876479, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.586259841918945, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.875510573387146, + "num_tokens": 546362692.0, + "step": 14326 + }, + { + "epoch": 1.8225416613662384, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.486448287963867, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8533400893211365, + "num_tokens": 546398856.0, + "step": 14327 + }, + { + "epoch": 1.822668871644829, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.673830032348633, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8616970181465149, + "num_tokens": 546439633.0, + "step": 14328 + }, + { + "epoch": 1.8227960819234195, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.42481231689453, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8732955455780029, + "num_tokens": 546477282.0, + "step": 14329 + }, + { + "epoch": 1.82292329220201, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.544118881225586, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8807625770568848, + "num_tokens": 546514584.0, + "step": 14330 + }, + { + "epoch": 1.8230505024806005, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39225196838379, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8732394576072693, + "num_tokens": 546550761.0, + "step": 14331 + }, + { + "epoch": 1.823177712759191, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54921531677246, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8561704754829407, + "num_tokens": 546594157.0, + "step": 14332 + }, + { + "epoch": 1.8233049230377816, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.464080810546875, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8771560192108154, + "num_tokens": 546634671.0, + "step": 14333 + }, + { + "epoch": 1.823432133316372, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.465778350830078, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8702481985092163, + "num_tokens": 546672084.0, + "step": 14334 + }, + { + "epoch": 1.8235593435949626, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.566482543945312, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8632185459136963, + "num_tokens": 546701920.0, + "step": 14335 + }, + { + "epoch": 1.823686553873553, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.556058883666992, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8731937408447266, + "num_tokens": 546735776.0, + "step": 14336 + }, + { + "epoch": 1.8238137641521435, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.291790008544922, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8740403652191162, + "num_tokens": 546772991.0, + "step": 14337 + }, + { + "epoch": 1.823940974430734, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69678497314453, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8629580736160278, + "num_tokens": 546805310.0, + "step": 14338 + }, + { + "epoch": 1.8240681847093245, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53003692626953, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8753863573074341, + "num_tokens": 546838240.0, + "step": 14339 + }, + { + "epoch": 1.824195394987915, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.466760635375977, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8638931512832642, + "num_tokens": 546878484.0, + "step": 14340 + }, + { + "epoch": 1.8243226052665056, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.596431732177734, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8844779133796692, + "num_tokens": 546919747.0, + "step": 14341 + }, + { + "epoch": 1.8244498155450959, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.593578338623047, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8695844411849976, + "num_tokens": 546962614.0, + "step": 14342 + }, + { + "epoch": 1.8245770258236864, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.30190086364746, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8623018264770508, + "num_tokens": 547000059.0, + "step": 14343 + }, + { + "epoch": 1.824704236102277, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.577177047729492, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8711812496185303, + "num_tokens": 547037372.0, + "step": 14344 + }, + { + "epoch": 1.8248314463808675, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35169219970703, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8555297255516052, + "num_tokens": 547071840.0, + "step": 14345 + }, + { + "epoch": 1.824958656659458, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.578065872192383, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8450397849082947, + "num_tokens": 547108916.0, + "step": 14346 + }, + { + "epoch": 1.8250858669380485, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.558834075927734, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8878285884857178, + "num_tokens": 547146991.0, + "step": 14347 + }, + { + "epoch": 1.825213077216639, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54753875732422, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8804627060890198, + "num_tokens": 547184093.0, + "step": 14348 + }, + { + "epoch": 1.8253402874952296, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.449525833129883, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8752405047416687, + "num_tokens": 547217570.0, + "step": 14349 + }, + { + "epoch": 1.8254674977738201, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.546735763549805, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8682856559753418, + "num_tokens": 547259498.0, + "step": 14350 + }, + { + "epoch": 1.8255947080524106, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.536623001098633, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.868047297000885, + "num_tokens": 547298419.0, + "step": 14351 + }, + { + "epoch": 1.8257219183310012, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.538742065429688, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8767260313034058, + "num_tokens": 547342984.0, + "step": 14352 + }, + { + "epoch": 1.8258491286095917, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.45858383178711, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8715107440948486, + "num_tokens": 547383162.0, + "step": 14353 + }, + { + "epoch": 1.8259763388881822, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.562564849853516, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8894765377044678, + "num_tokens": 547416967.0, + "step": 14354 + }, + { + "epoch": 1.8261035491667728, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.474746704101562, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8727306723594666, + "num_tokens": 547454695.0, + "step": 14355 + }, + { + "epoch": 1.8262307594453633, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.314138412475586, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8787752389907837, + "num_tokens": 547494258.0, + "step": 14356 + }, + { + "epoch": 1.8263579697239538, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.543241500854492, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8645471334457397, + "num_tokens": 547531404.0, + "step": 14357 + }, + { + "epoch": 1.8264851800025443, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.595651626586914, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8742048740386963, + "num_tokens": 547566553.0, + "step": 14358 + }, + { + "epoch": 1.8266123902811349, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.29372787475586, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8623391389846802, + "num_tokens": 547605505.0, + "step": 14359 + }, + { + "epoch": 1.8267396005597252, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61928939819336, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8777369260787964, + "num_tokens": 547643146.0, + "step": 14360 + }, + { + "epoch": 1.8268668108383157, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.42552947998047, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8663667440414429, + "num_tokens": 547683373.0, + "step": 14361 + }, + { + "epoch": 1.8269940211169062, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.603586196899414, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8643820285797119, + "num_tokens": 547716762.0, + "step": 14362 + }, + { + "epoch": 1.8271212313954968, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.512998580932617, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8774310350418091, + "num_tokens": 547756907.0, + "step": 14363 + }, + { + "epoch": 1.8272484416740873, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.26129150390625, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8655632138252258, + "num_tokens": 547793545.0, + "step": 14364 + }, + { + "epoch": 1.8273756519526778, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.474210739135742, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.860345721244812, + "num_tokens": 547836845.0, + "step": 14365 + }, + { + "epoch": 1.8275028622312681, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.578691482543945, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8749607801437378, + "num_tokens": 547875849.0, + "step": 14366 + }, + { + "epoch": 1.8276300725098586, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.449230194091797, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8633682131767273, + "num_tokens": 547918672.0, + "step": 14367 + }, + { + "epoch": 1.8277572827884492, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59123992919922, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8569537401199341, + "num_tokens": 547958681.0, + "step": 14368 + }, + { + "epoch": 1.8278844930670397, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.481670379638672, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8632329702377319, + "num_tokens": 547996940.0, + "step": 14369 + }, + { + "epoch": 1.8280117033456302, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63631820678711, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8774212002754211, + "num_tokens": 548030188.0, + "step": 14370 + }, + { + "epoch": 1.8281389136242208, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.383968353271484, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8849852681159973, + "num_tokens": 548066891.0, + "step": 14371 + }, + { + "epoch": 1.8282661239028113, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35565185546875, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8753160238265991, + "num_tokens": 548104664.0, + "step": 14372 + }, + { + "epoch": 1.8283933341814018, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53215789794922, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8655150532722473, + "num_tokens": 548143938.0, + "step": 14373 + }, + { + "epoch": 1.8285205444599923, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.37086296081543, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8673588037490845, + "num_tokens": 548183213.0, + "step": 14374 + }, + { + "epoch": 1.8286477547385829, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.338871002197266, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8532934188842773, + "num_tokens": 548221304.0, + "step": 14375 + }, + { + "epoch": 1.8287749650171734, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58352279663086, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8672180771827698, + "num_tokens": 548261123.0, + "step": 14376 + }, + { + "epoch": 1.828902175295764, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.41470718383789, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8505550622940063, + "num_tokens": 548305040.0, + "step": 14377 + }, + { + "epoch": 1.8290293855743545, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.373653411865234, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.87663733959198, + "num_tokens": 548338538.0, + "step": 14378 + }, + { + "epoch": 1.829156595852945, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54202651977539, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8724939227104187, + "num_tokens": 548378510.0, + "step": 14379 + }, + { + "epoch": 1.8292838061315355, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.424251556396484, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8629133701324463, + "num_tokens": 548415790.0, + "step": 14380 + }, + { + "epoch": 1.829411016410126, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.529644012451172, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8594540357589722, + "num_tokens": 548447294.0, + "step": 14381 + }, + { + "epoch": 1.8295382266887166, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40328598022461, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.87534499168396, + "num_tokens": 548484617.0, + "step": 14382 + }, + { + "epoch": 1.829665436967307, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.493619918823242, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8650279641151428, + "num_tokens": 548527552.0, + "step": 14383 + }, + { + "epoch": 1.8297926472458976, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.584531784057617, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8875489234924316, + "num_tokens": 548561322.0, + "step": 14384 + }, + { + "epoch": 1.829919857524488, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.481094360351562, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8796422481536865, + "num_tokens": 548594810.0, + "step": 14385 + }, + { + "epoch": 1.8300470678030785, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.546356201171875, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8634753823280334, + "num_tokens": 548631094.0, + "step": 14386 + }, + { + "epoch": 1.830174278081669, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.355083465576172, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8715384006500244, + "num_tokens": 548669162.0, + "step": 14387 + }, + { + "epoch": 1.8303014883602595, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74472999572754, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8682800531387329, + "num_tokens": 548704555.0, + "step": 14388 + }, + { + "epoch": 1.83042869863885, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.624134063720703, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8821713924407959, + "num_tokens": 548741852.0, + "step": 14389 + }, + { + "epoch": 1.8305559089174406, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.45357322692871, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8573275208473206, + "num_tokens": 548788455.0, + "step": 14390 + }, + { + "epoch": 1.8306831191960309, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64802360534668, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8767606616020203, + "num_tokens": 548826347.0, + "step": 14391 + }, + { + "epoch": 1.8308103294746214, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.413070678710938, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8634249567985535, + "num_tokens": 548865428.0, + "step": 14392 + }, + { + "epoch": 1.830937539753212, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5468692779541, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8576313853263855, + "num_tokens": 548901671.0, + "step": 14393 + }, + { + "epoch": 1.8310647500318025, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.44980812072754, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8653622269630432, + "num_tokens": 548939234.0, + "step": 14394 + }, + { + "epoch": 1.831191960310393, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.413002014160156, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.874142050743103, + "num_tokens": 548975334.0, + "step": 14395 + }, + { + "epoch": 1.8313191705889835, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53857421875, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8632499575614929, + "num_tokens": 549016372.0, + "step": 14396 + }, + { + "epoch": 1.831446380867574, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.607271194458008, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8760964870452881, + "num_tokens": 549058049.0, + "step": 14397 + }, + { + "epoch": 1.8315735911461646, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5364990234375, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8573859930038452, + "num_tokens": 549097855.0, + "step": 14398 + }, + { + "epoch": 1.831700801424755, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.474119186401367, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8889120221138, + "num_tokens": 549130906.0, + "step": 14399 + }, + { + "epoch": 1.8318280117033456, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.590238571166992, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8504873514175415, + "num_tokens": 549176574.0, + "step": 14400 + }, + { + "epoch": 1.8319552219819362, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36836814880371, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8679836988449097, + "num_tokens": 549213945.0, + "step": 14401 + }, + { + "epoch": 1.8320824322605267, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.714336395263672, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8604695796966553, + "num_tokens": 549255843.0, + "step": 14402 + }, + { + "epoch": 1.8322096425391172, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.447643280029297, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.873160183429718, + "num_tokens": 549292573.0, + "step": 14403 + }, + { + "epoch": 1.8323368528177078, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.429855346679688, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8812059164047241, + "num_tokens": 549335291.0, + "step": 14404 + }, + { + "epoch": 1.8324640630962983, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49401092529297, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8568494319915771, + "num_tokens": 549381337.0, + "step": 14405 + }, + { + "epoch": 1.8325912733748888, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50493621826172, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8798562288284302, + "num_tokens": 549418180.0, + "step": 14406 + }, + { + "epoch": 1.8327184836534793, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.496992111206055, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.877338707447052, + "num_tokens": 549461623.0, + "step": 14407 + }, + { + "epoch": 1.8328456939320699, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.508548736572266, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8724285364151001, + "num_tokens": 549500791.0, + "step": 14408 + }, + { + "epoch": 1.8329729042106602, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.663223266601562, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8755946159362793, + "num_tokens": 549533343.0, + "step": 14409 + }, + { + "epoch": 1.8331001144892507, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.683454513549805, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8616033792495728, + "num_tokens": 549572707.0, + "step": 14410 + }, + { + "epoch": 1.8332273247678412, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54069709777832, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8762136101722717, + "num_tokens": 549609772.0, + "step": 14411 + }, + { + "epoch": 1.8333545350464318, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51852035522461, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8577033281326294, + "num_tokens": 549648626.0, + "step": 14412 + }, + { + "epoch": 1.8334817453250223, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.574016571044922, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.858129620552063, + "num_tokens": 549693211.0, + "step": 14413 + }, + { + "epoch": 1.8336089556036128, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.648799896240234, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8640373945236206, + "num_tokens": 549729463.0, + "step": 14414 + }, + { + "epoch": 1.8337361658822031, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.538738250732422, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8735670447349548, + "num_tokens": 549769139.0, + "step": 14415 + }, + { + "epoch": 1.8338633761607936, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56027603149414, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8724265098571777, + "num_tokens": 549807067.0, + "step": 14416 + }, + { + "epoch": 1.8339905864393842, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.374284744262695, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.868969202041626, + "num_tokens": 549842689.0, + "step": 14417 + }, + { + "epoch": 1.8341177967179747, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.678544998168945, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8885009288787842, + "num_tokens": 549876108.0, + "step": 14418 + }, + { + "epoch": 1.8342450069965652, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61104393005371, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8829220533370972, + "num_tokens": 549909656.0, + "step": 14419 + }, + { + "epoch": 1.8343722172751558, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.630512237548828, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8611295223236084, + "num_tokens": 549949156.0, + "step": 14420 + }, + { + "epoch": 1.8344994275537463, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.714780807495117, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8764972686767578, + "num_tokens": 549990256.0, + "step": 14421 + }, + { + "epoch": 1.8346266378323368, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.574325561523438, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8775736093521118, + "num_tokens": 550023557.0, + "step": 14422 + }, + { + "epoch": 1.8347538481109273, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40517807006836, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8653008937835693, + "num_tokens": 550069402.0, + "step": 14423 + }, + { + "epoch": 1.8348810583895179, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.623241424560547, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8769303560256958, + "num_tokens": 550110860.0, + "step": 14424 + }, + { + "epoch": 1.8350082686681084, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62714195251465, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8545352220535278, + "num_tokens": 550151736.0, + "step": 14425 + }, + { + "epoch": 1.835135478946699, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.576017379760742, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8719182014465332, + "num_tokens": 550187731.0, + "step": 14426 + }, + { + "epoch": 1.8352626892252895, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.546659469604492, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.871502697467804, + "num_tokens": 550220545.0, + "step": 14427 + }, + { + "epoch": 1.83538989950388, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.445144653320312, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8734524846076965, + "num_tokens": 550254755.0, + "step": 14428 + }, + { + "epoch": 1.8355171097824705, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.687366485595703, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8765152096748352, + "num_tokens": 550295191.0, + "step": 14429 + }, + { + "epoch": 1.835644320061061, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.630781173706055, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8643179535865784, + "num_tokens": 550332984.0, + "step": 14430 + }, + { + "epoch": 1.8357715303396516, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7643985748291, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8721328973770142, + "num_tokens": 550377470.0, + "step": 14431 + }, + { + "epoch": 1.835898740618242, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.282445907592773, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8738938570022583, + "num_tokens": 550416123.0, + "step": 14432 + }, + { + "epoch": 1.8360259508968326, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.52703285217285, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8747822642326355, + "num_tokens": 550458062.0, + "step": 14433 + }, + { + "epoch": 1.836153161175423, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.408403396606445, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8632930517196655, + "num_tokens": 550496485.0, + "step": 14434 + }, + { + "epoch": 1.8362803714540135, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.507221221923828, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8717354536056519, + "num_tokens": 550536500.0, + "step": 14435 + }, + { + "epoch": 1.836407581732604, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.526309967041016, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8639744520187378, + "num_tokens": 550575611.0, + "step": 14436 + }, + { + "epoch": 1.8365347920111945, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.368762969970703, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8609511852264404, + "num_tokens": 550616431.0, + "step": 14437 + }, + { + "epoch": 1.836662002289785, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50649642944336, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8798094391822815, + "num_tokens": 550653626.0, + "step": 14438 + }, + { + "epoch": 1.8367892125683756, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53074836730957, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.866178035736084, + "num_tokens": 550690899.0, + "step": 14439 + }, + { + "epoch": 1.8369164228469659, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.584125518798828, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8703005313873291, + "num_tokens": 550734408.0, + "step": 14440 + }, + { + "epoch": 1.8370436331255564, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.35337257385254, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8577470779418945, + "num_tokens": 550767888.0, + "step": 14441 + }, + { + "epoch": 1.837170843404147, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.45207977294922, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8703662157058716, + "num_tokens": 550807192.0, + "step": 14442 + }, + { + "epoch": 1.8372980536827375, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61398696899414, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8782223463058472, + "num_tokens": 550848006.0, + "step": 14443 + }, + { + "epoch": 1.837425263961328, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.547346115112305, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.869693398475647, + "num_tokens": 550888350.0, + "step": 14444 + }, + { + "epoch": 1.8375524742399185, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.670106887817383, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8764592409133911, + "num_tokens": 550925389.0, + "step": 14445 + }, + { + "epoch": 1.837679684518509, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.673139572143555, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8740006685256958, + "num_tokens": 550962610.0, + "step": 14446 + }, + { + "epoch": 1.8378068947970996, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.505138397216797, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8658058047294617, + "num_tokens": 551001018.0, + "step": 14447 + }, + { + "epoch": 1.83793410507569, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48759651184082, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8710517883300781, + "num_tokens": 551038505.0, + "step": 14448 + }, + { + "epoch": 1.8380613153542806, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59204864501953, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.88094562292099, + "num_tokens": 551072235.0, + "step": 14449 + }, + { + "epoch": 1.8381885256328712, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.591150283813477, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8832540512084961, + "num_tokens": 551110129.0, + "step": 14450 + }, + { + "epoch": 1.8383157359114617, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.494686126708984, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8707060813903809, + "num_tokens": 551152380.0, + "step": 14451 + }, + { + "epoch": 1.8384429461900522, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.563613891601562, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8809396624565125, + "num_tokens": 551195942.0, + "step": 14452 + }, + { + "epoch": 1.8385701564686427, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.314231872558594, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8614990711212158, + "num_tokens": 551238502.0, + "step": 14453 + }, + { + "epoch": 1.8386973667472333, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.684062957763672, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8806281685829163, + "num_tokens": 551278091.0, + "step": 14454 + }, + { + "epoch": 1.8388245770258238, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58881950378418, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8499170541763306, + "num_tokens": 551314940.0, + "step": 14455 + }, + { + "epoch": 1.8389517873044143, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.351436614990234, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.856568455696106, + "num_tokens": 551356427.0, + "step": 14456 + }, + { + "epoch": 1.8390789975830049, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.578521728515625, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8749114274978638, + "num_tokens": 551400436.0, + "step": 14457 + }, + { + "epoch": 1.8392062078615952, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.579626083374023, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8680486679077148, + "num_tokens": 551438335.0, + "step": 14458 + }, + { + "epoch": 1.8393334181401857, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.430124282836914, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.873112678527832, + "num_tokens": 551471214.0, + "step": 14459 + }, + { + "epoch": 1.8394606284187762, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.607276916503906, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.856518030166626, + "num_tokens": 551504071.0, + "step": 14460 + }, + { + "epoch": 1.8395878386973668, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.587276458740234, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8652170896530151, + "num_tokens": 551538456.0, + "step": 14461 + }, + { + "epoch": 1.8397150489759573, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.343345642089844, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8742635250091553, + "num_tokens": 551575334.0, + "step": 14462 + }, + { + "epoch": 1.8398422592545478, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54317283630371, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8783076405525208, + "num_tokens": 551617127.0, + "step": 14463 + }, + { + "epoch": 1.8399694695331381, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67298698425293, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.87430340051651, + "num_tokens": 551655548.0, + "step": 14464 + }, + { + "epoch": 1.8400966798117286, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59967803955078, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8671159148216248, + "num_tokens": 551691807.0, + "step": 14465 + }, + { + "epoch": 1.8402238900903192, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.483091354370117, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8781828880310059, + "num_tokens": 551729805.0, + "step": 14466 + }, + { + "epoch": 1.8403511003689097, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.619247436523438, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8471018671989441, + "num_tokens": 551775220.0, + "step": 14467 + }, + { + "epoch": 1.8404783106475002, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.525165557861328, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8625820875167847, + "num_tokens": 551816242.0, + "step": 14468 + }, + { + "epoch": 1.8406055209260908, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.405752182006836, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8585081696510315, + "num_tokens": 551854929.0, + "step": 14469 + }, + { + "epoch": 1.8407327312046813, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.612079620361328, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8675711154937744, + "num_tokens": 551898877.0, + "step": 14470 + }, + { + "epoch": 1.8408599414832718, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.371826171875, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8869459629058838, + "num_tokens": 551934252.0, + "step": 14471 + }, + { + "epoch": 1.8409871517618623, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5010929107666, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8811904191970825, + "num_tokens": 551974779.0, + "step": 14472 + }, + { + "epoch": 1.8411143620404529, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 29.383220672607422, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8622481822967529, + "num_tokens": 552011627.0, + "step": 14473 + }, + { + "epoch": 1.8412415723190434, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.727394104003906, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8492767810821533, + "num_tokens": 552045333.0, + "step": 14474 + }, + { + "epoch": 1.841368782597634, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 23.560535430908203, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8644217252731323, + "num_tokens": 552079974.0, + "step": 14475 + }, + { + "epoch": 1.8414959928762245, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73885154724121, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8768672943115234, + "num_tokens": 552117507.0, + "step": 14476 + }, + { + "epoch": 1.841623203154815, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.154380798339844, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8686290979385376, + "num_tokens": 552156009.0, + "step": 14477 + }, + { + "epoch": 1.8417504134334055, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.201147079467773, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8787820339202881, + "num_tokens": 552189092.0, + "step": 14478 + }, + { + "epoch": 1.841877623711996, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.336959838867188, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8680825233459473, + "num_tokens": 552228305.0, + "step": 14479 + }, + { + "epoch": 1.8420048339905866, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.354278564453125, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8711726665496826, + "num_tokens": 552269402.0, + "step": 14480 + }, + { + "epoch": 1.842132044269177, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.331647872924805, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8676050901412964, + "num_tokens": 552304870.0, + "step": 14481 + }, + { + "epoch": 1.8422592545477676, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.486066818237305, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8647373914718628, + "num_tokens": 552336802.0, + "step": 14482 + }, + { + "epoch": 1.842386464826358, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61220359802246, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8825286030769348, + "num_tokens": 552376124.0, + "step": 14483 + }, + { + "epoch": 1.8425136751049485, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56501007080078, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8610337376594543, + "num_tokens": 552412285.0, + "step": 14484 + }, + { + "epoch": 1.842640885383539, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6171875, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.871019721031189, + "num_tokens": 552447527.0, + "step": 14485 + }, + { + "epoch": 1.8427680956621295, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.562747955322266, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8725439310073853, + "num_tokens": 552486569.0, + "step": 14486 + }, + { + "epoch": 1.84289530594072, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.506420135498047, + "learning_rate": 1e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8351494073867798, + "num_tokens": 552529158.0, + "step": 14487 + }, + { + "epoch": 1.8430225162193106, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.524351119995117, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8684219121932983, + "num_tokens": 552572183.0, + "step": 14488 + }, + { + "epoch": 1.8431497264979009, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.597000122070312, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8668131828308105, + "num_tokens": 552609785.0, + "step": 14489 + }, + { + "epoch": 1.8432769367764914, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.561813354492188, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.875771701335907, + "num_tokens": 552641243.0, + "step": 14490 + }, + { + "epoch": 1.843404147055082, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.769906997680664, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.861436128616333, + "num_tokens": 552677595.0, + "step": 14491 + }, + { + "epoch": 1.8435313573336725, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.394657135009766, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8706039786338806, + "num_tokens": 552715492.0, + "step": 14492 + }, + { + "epoch": 1.843658567612263, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.663169860839844, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8683936595916748, + "num_tokens": 552757410.0, + "step": 14493 + }, + { + "epoch": 1.8437857778908535, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.650806427001953, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8853473663330078, + "num_tokens": 552793794.0, + "step": 14494 + }, + { + "epoch": 1.843912988169444, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.437223434448242, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8555968999862671, + "num_tokens": 552836010.0, + "step": 14495 + }, + { + "epoch": 1.8440401984480346, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67310905456543, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.872174859046936, + "num_tokens": 552874136.0, + "step": 14496 + }, + { + "epoch": 1.844167408726625, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.500457763671875, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8659566640853882, + "num_tokens": 552912314.0, + "step": 14497 + }, + { + "epoch": 1.8442946190052156, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.66745948791504, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8573537468910217, + "num_tokens": 552953998.0, + "step": 14498 + }, + { + "epoch": 1.8444218292838062, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.571157455444336, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8882075548171997, + "num_tokens": 552987310.0, + "step": 14499 + }, + { + "epoch": 1.8445490395623967, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.478805541992188, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8663641214370728, + "num_tokens": 553019830.0, + "step": 14500 + }, + { + "epoch": 1.8446762498409872, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.3438663482666, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8685975074768066, + "num_tokens": 553066695.0, + "step": 14501 + }, + { + "epoch": 1.8448034601195777, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.751874923706055, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8604612946510315, + "num_tokens": 553098943.0, + "step": 14502 + }, + { + "epoch": 1.8449306703981683, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59004783630371, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8735440373420715, + "num_tokens": 553136106.0, + "step": 14503 + }, + { + "epoch": 1.8450578806767588, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.660598754882812, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.876288652420044, + "num_tokens": 553170014.0, + "step": 14504 + }, + { + "epoch": 1.8451850909553493, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.558645248413086, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8631684184074402, + "num_tokens": 553213297.0, + "step": 14505 + }, + { + "epoch": 1.8453123012339399, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.660383224487305, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8703899383544922, + "num_tokens": 553248559.0, + "step": 14506 + }, + { + "epoch": 1.8454395115125302, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.512062072753906, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8721392154693604, + "num_tokens": 553289140.0, + "step": 14507 + }, + { + "epoch": 1.8455667217911207, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.865577697753906, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8619892597198486, + "num_tokens": 553330619.0, + "step": 14508 + }, + { + "epoch": 1.8456939320697112, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.394981384277344, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8806575536727905, + "num_tokens": 553371295.0, + "step": 14509 + }, + { + "epoch": 1.8458211423483017, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.731822967529297, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8664320111274719, + "num_tokens": 553410237.0, + "step": 14510 + }, + { + "epoch": 1.8459483526268923, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.531002044677734, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8765336275100708, + "num_tokens": 553446727.0, + "step": 14511 + }, + { + "epoch": 1.8460755629054828, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.688098907470703, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8763420581817627, + "num_tokens": 553488626.0, + "step": 14512 + }, + { + "epoch": 1.846202773184073, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.478605270385742, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8626648187637329, + "num_tokens": 553528294.0, + "step": 14513 + }, + { + "epoch": 1.8463299834626636, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.647171020507812, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8731332421302795, + "num_tokens": 553563738.0, + "step": 14514 + }, + { + "epoch": 1.8464571937412542, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.649707794189453, + "learning_rate": 1e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8365895748138428, + "num_tokens": 553603351.0, + "step": 14515 + }, + { + "epoch": 1.8465844040198447, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.560199737548828, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8679122924804688, + "num_tokens": 553643655.0, + "step": 14516 + }, + { + "epoch": 1.8467116142984352, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.611202239990234, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8819038271903992, + "num_tokens": 553677944.0, + "step": 14517 + }, + { + "epoch": 1.8468388245770258, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.650236129760742, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8650933504104614, + "num_tokens": 553712120.0, + "step": 14518 + }, + { + "epoch": 1.8469660348556163, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.529699325561523, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8731456995010376, + "num_tokens": 553751442.0, + "step": 14519 + }, + { + "epoch": 1.8470932451342068, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.55058479309082, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8758397102355957, + "num_tokens": 553782924.0, + "step": 14520 + }, + { + "epoch": 1.8472204554127973, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.550434112548828, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8607986569404602, + "num_tokens": 553818861.0, + "step": 14521 + }, + { + "epoch": 1.8473476656913879, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54000473022461, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8681544661521912, + "num_tokens": 553855593.0, + "step": 14522 + }, + { + "epoch": 1.8474748759699784, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.577299118041992, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8677182197570801, + "num_tokens": 553896028.0, + "step": 14523 + }, + { + "epoch": 1.847602086248569, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.419780731201172, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8737610578536987, + "num_tokens": 553935212.0, + "step": 14524 + }, + { + "epoch": 1.8477292965271594, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6678466796875, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8685294389724731, + "num_tokens": 553971231.0, + "step": 14525 + }, + { + "epoch": 1.84785650680575, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.589282989501953, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8775079250335693, + "num_tokens": 554006782.0, + "step": 14526 + }, + { + "epoch": 1.8479837170843405, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.582643508911133, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.862348198890686, + "num_tokens": 554043427.0, + "step": 14527 + }, + { + "epoch": 1.848110927362931, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6893310546875, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8590939044952393, + "num_tokens": 554084187.0, + "step": 14528 + }, + { + "epoch": 1.8482381376415216, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.582351684570312, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.870532751083374, + "num_tokens": 554122186.0, + "step": 14529 + }, + { + "epoch": 1.848365347920112, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.537012100219727, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8686803579330444, + "num_tokens": 554161388.0, + "step": 14530 + }, + { + "epoch": 1.8484925581987026, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.404722213745117, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8614898920059204, + "num_tokens": 554199837.0, + "step": 14531 + }, + { + "epoch": 1.848619768477293, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.616498947143555, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8599388599395752, + "num_tokens": 554239567.0, + "step": 14532 + }, + { + "epoch": 1.8487469787558835, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.404455184936523, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8726197481155396, + "num_tokens": 554277040.0, + "step": 14533 + }, + { + "epoch": 1.848874189034474, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.625553131103516, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8730801343917847, + "num_tokens": 554318994.0, + "step": 14534 + }, + { + "epoch": 1.8490013993130645, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40513801574707, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8628809452056885, + "num_tokens": 554355334.0, + "step": 14535 + }, + { + "epoch": 1.849128609591655, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.40606689453125, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8587119579315186, + "num_tokens": 554396504.0, + "step": 14536 + }, + { + "epoch": 1.8492558198702456, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.500471115112305, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8760679960250854, + "num_tokens": 554429772.0, + "step": 14537 + }, + { + "epoch": 1.8493830301488359, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.540599822998047, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8708876967430115, + "num_tokens": 554470102.0, + "step": 14538 + }, + { + "epoch": 1.8495102404274264, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.617372512817383, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8608396053314209, + "num_tokens": 554511632.0, + "step": 14539 + }, + { + "epoch": 1.849637450706017, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.568172454833984, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8654612302780151, + "num_tokens": 554553028.0, + "step": 14540 + }, + { + "epoch": 1.8497646609846075, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.579538345336914, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8632783889770508, + "num_tokens": 554585109.0, + "step": 14541 + }, + { + "epoch": 1.849891871263198, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.440397262573242, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8702819347381592, + "num_tokens": 554621141.0, + "step": 14542 + }, + { + "epoch": 1.8500190815417885, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.755340576171875, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8685126304626465, + "num_tokens": 554660838.0, + "step": 14543 + }, + { + "epoch": 1.850146291820379, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.498138427734375, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8709944486618042, + "num_tokens": 554696338.0, + "step": 14544 + }, + { + "epoch": 1.8502735020989696, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50031280517578, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8808577060699463, + "num_tokens": 554729782.0, + "step": 14545 + }, + { + "epoch": 1.85040071237756, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.670974731445312, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8850835561752319, + "num_tokens": 554767566.0, + "step": 14546 + }, + { + "epoch": 1.8505279226561506, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.455642700195312, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8748801350593567, + "num_tokens": 554806063.0, + "step": 14547 + }, + { + "epoch": 1.8506551329347412, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56884765625, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8664901852607727, + "num_tokens": 554839418.0, + "step": 14548 + }, + { + "epoch": 1.8507823432133317, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.285194396972656, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8650676608085632, + "num_tokens": 554881888.0, + "step": 14549 + }, + { + "epoch": 1.8509095534919222, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58491325378418, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.888466477394104, + "num_tokens": 554921561.0, + "step": 14550 + }, + { + "epoch": 1.8510367637705127, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.467391967773438, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8447887897491455, + "num_tokens": 554954380.0, + "step": 14551 + }, + { + "epoch": 1.8511639740491033, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.632322311401367, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8696551322937012, + "num_tokens": 554994781.0, + "step": 14552 + }, + { + "epoch": 1.8512911843276938, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.676376342773438, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.877556562423706, + "num_tokens": 555024938.0, + "step": 14553 + }, + { + "epoch": 1.8514183946062843, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4381103515625, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8578078746795654, + "num_tokens": 555059785.0, + "step": 14554 + }, + { + "epoch": 1.8515456048848749, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.55187225341797, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8440170288085938, + "num_tokens": 555097502.0, + "step": 14555 + }, + { + "epoch": 1.8516728151634652, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.65531349182129, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8669196963310242, + "num_tokens": 555139976.0, + "step": 14556 + }, + { + "epoch": 1.8518000254420557, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.544509887695312, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8722770810127258, + "num_tokens": 555181250.0, + "step": 14557 + }, + { + "epoch": 1.8519272357206462, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.65273666381836, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8503726720809937, + "num_tokens": 555214927.0, + "step": 14558 + }, + { + "epoch": 1.8520544459992367, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60123062133789, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8660948276519775, + "num_tokens": 555255795.0, + "step": 14559 + }, + { + "epoch": 1.8521816562778273, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.276748657226562, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8726175427436829, + "num_tokens": 555294177.0, + "step": 14560 + }, + { + "epoch": 1.8523088665564178, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.717529296875, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8812832832336426, + "num_tokens": 555331548.0, + "step": 14561 + }, + { + "epoch": 1.852436076835008, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.680288314819336, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.88139408826828, + "num_tokens": 555368426.0, + "step": 14562 + }, + { + "epoch": 1.8525632871135986, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.46359634399414, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8854698538780212, + "num_tokens": 555411594.0, + "step": 14563 + }, + { + "epoch": 1.8526904973921892, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61250114440918, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8609842658042908, + "num_tokens": 555450412.0, + "step": 14564 + }, + { + "epoch": 1.8528177076707797, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.340572357177734, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8778164386749268, + "num_tokens": 555489067.0, + "step": 14565 + }, + { + "epoch": 1.8529449179493702, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.564687728881836, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.875224232673645, + "num_tokens": 555524552.0, + "step": 14566 + }, + { + "epoch": 1.8530721282279607, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.674884796142578, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8425437211990356, + "num_tokens": 555563766.0, + "step": 14567 + }, + { + "epoch": 1.8531993385065513, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.529720306396484, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8759837746620178, + "num_tokens": 555601410.0, + "step": 14568 + }, + { + "epoch": 1.8533265487851418, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.460599899291992, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.868028998374939, + "num_tokens": 555641346.0, + "step": 14569 + }, + { + "epoch": 1.8534537590637323, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.650928497314453, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8792004585266113, + "num_tokens": 555676537.0, + "step": 14570 + }, + { + "epoch": 1.8535809693423229, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.72977066040039, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8545130491256714, + "num_tokens": 555721634.0, + "step": 14571 + }, + { + "epoch": 1.8537081796209134, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.552888870239258, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8687388896942139, + "num_tokens": 555765447.0, + "step": 14572 + }, + { + "epoch": 1.853835389899504, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.727773666381836, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8687013387680054, + "num_tokens": 555798598.0, + "step": 14573 + }, + { + "epoch": 1.8539626001780944, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.487329483032227, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8784180879592896, + "num_tokens": 555829465.0, + "step": 14574 + }, + { + "epoch": 1.854089810456685, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.578683853149414, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8706805109977722, + "num_tokens": 555866577.0, + "step": 14575 + }, + { + "epoch": 1.8542170207352755, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.539443969726562, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8728104829788208, + "num_tokens": 555900472.0, + "step": 14576 + }, + { + "epoch": 1.854344231013866, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4127254486084, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8645623326301575, + "num_tokens": 555936356.0, + "step": 14577 + }, + { + "epoch": 1.8544714412924566, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.400663375854492, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8624188899993896, + "num_tokens": 555975338.0, + "step": 14578 + }, + { + "epoch": 1.854598651571047, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.695207595825195, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8601095080375671, + "num_tokens": 556019651.0, + "step": 14579 + }, + { + "epoch": 1.8547258618496376, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.57370376586914, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8646754026412964, + "num_tokens": 556054530.0, + "step": 14580 + }, + { + "epoch": 1.854853072128228, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.544761657714844, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8653950691223145, + "num_tokens": 556091985.0, + "step": 14581 + }, + { + "epoch": 1.8549802824068184, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.743452072143555, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8725787401199341, + "num_tokens": 556128777.0, + "step": 14582 + }, + { + "epoch": 1.855107492685409, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.632080078125, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8876262903213501, + "num_tokens": 556164286.0, + "step": 14583 + }, + { + "epoch": 1.8552347029639995, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.725889205932617, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8775578141212463, + "num_tokens": 556199141.0, + "step": 14584 + }, + { + "epoch": 1.85536191324259, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.557674407958984, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8749781847000122, + "num_tokens": 556237104.0, + "step": 14585 + }, + { + "epoch": 1.8554891235211806, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61565399169922, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8595370650291443, + "num_tokens": 556275343.0, + "step": 14586 + }, + { + "epoch": 1.8556163337997709, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.638118743896484, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8847141265869141, + "num_tokens": 556316539.0, + "step": 14587 + }, + { + "epoch": 1.8557435440783614, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5834903717041, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8829838633537292, + "num_tokens": 556351101.0, + "step": 14588 + }, + { + "epoch": 1.855870754356952, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.47329330444336, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8814738988876343, + "num_tokens": 556382458.0, + "step": 14589 + }, + { + "epoch": 1.8559979646355425, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.543025970458984, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8720802068710327, + "num_tokens": 556416980.0, + "step": 14590 + }, + { + "epoch": 1.856125174914133, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61906623840332, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8759031295776367, + "num_tokens": 556451868.0, + "step": 14591 + }, + { + "epoch": 1.8562523851927235, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.346622467041016, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.857679009437561, + "num_tokens": 556485682.0, + "step": 14592 + }, + { + "epoch": 1.856379595471314, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.625694274902344, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8808923363685608, + "num_tokens": 556524857.0, + "step": 14593 + }, + { + "epoch": 1.8565068057499046, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.467321395874023, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8745422959327698, + "num_tokens": 556565763.0, + "step": 14594 + }, + { + "epoch": 1.856634016028495, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.577152252197266, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8585322499275208, + "num_tokens": 556606202.0, + "step": 14595 + }, + { + "epoch": 1.8567612263070856, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48292350769043, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8793025016784668, + "num_tokens": 556646906.0, + "step": 14596 + }, + { + "epoch": 1.8568884365856761, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67510986328125, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8719661235809326, + "num_tokens": 556684249.0, + "step": 14597 + }, + { + "epoch": 1.8570156468642667, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68173599243164, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8592531085014343, + "num_tokens": 556726840.0, + "step": 14598 + }, + { + "epoch": 1.8571428571428572, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.594358444213867, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8728851079940796, + "num_tokens": 556764822.0, + "step": 14599 + }, + { + "epoch": 1.8572700674214477, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.667160034179688, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8677994608879089, + "num_tokens": 556800884.0, + "step": 14600 + }, + { + "epoch": 1.8573972777000383, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61724090576172, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8758959174156189, + "num_tokens": 556841233.0, + "step": 14601 + }, + { + "epoch": 1.8575244879786288, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.65508460998535, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8640381693840027, + "num_tokens": 556880995.0, + "step": 14602 + }, + { + "epoch": 1.8576516982572193, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.619543075561523, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.874923586845398, + "num_tokens": 556914367.0, + "step": 14603 + }, + { + "epoch": 1.8577789085358098, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.597917556762695, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8509364724159241, + "num_tokens": 556955944.0, + "step": 14604 + }, + { + "epoch": 1.8579061188144002, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60711097717285, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8531481027603149, + "num_tokens": 556989851.0, + "step": 14605 + }, + { + "epoch": 1.8580333290929907, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.700458526611328, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.869519054889679, + "num_tokens": 557033662.0, + "step": 14606 + }, + { + "epoch": 1.8581605393715812, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.559080123901367, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8621180057525635, + "num_tokens": 557074869.0, + "step": 14607 + }, + { + "epoch": 1.8582877496501717, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.437931060791016, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8595304489135742, + "num_tokens": 557110164.0, + "step": 14608 + }, + { + "epoch": 1.8584149599287623, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.72942352294922, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8702680468559265, + "num_tokens": 557149309.0, + "step": 14609 + }, + { + "epoch": 1.8585421702073528, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51194190979004, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8643892407417297, + "num_tokens": 557180510.0, + "step": 14610 + }, + { + "epoch": 1.858669380485943, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.474807739257812, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8756645321846008, + "num_tokens": 557218062.0, + "step": 14611 + }, + { + "epoch": 1.8587965907645336, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.379606246948242, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8622655272483826, + "num_tokens": 557258963.0, + "step": 14612 + }, + { + "epoch": 1.8589238010431242, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62682342529297, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8712217211723328, + "num_tokens": 557296046.0, + "step": 14613 + }, + { + "epoch": 1.8590510113217147, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.651140213012695, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8666569590568542, + "num_tokens": 557328395.0, + "step": 14614 + }, + { + "epoch": 1.8591782216003052, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.471168518066406, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8722732067108154, + "num_tokens": 557363888.0, + "step": 14615 + }, + { + "epoch": 1.8593054318788957, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6352596282959, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8624460697174072, + "num_tokens": 557402944.0, + "step": 14616 + }, + { + "epoch": 1.8594326421574863, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.511465072631836, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8756683468818665, + "num_tokens": 557442247.0, + "step": 14617 + }, + { + "epoch": 1.8595598524360768, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.816267013549805, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8640847206115723, + "num_tokens": 557483580.0, + "step": 14618 + }, + { + "epoch": 1.8596870627146673, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56583023071289, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8744062781333923, + "num_tokens": 557520134.0, + "step": 14619 + }, + { + "epoch": 1.8598142729932579, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.409013748168945, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8646314144134521, + "num_tokens": 557565726.0, + "step": 14620 + }, + { + "epoch": 1.8599414832718484, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.632038116455078, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8682479858398438, + "num_tokens": 557602871.0, + "step": 14621 + }, + { + "epoch": 1.860068693550439, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.533761978149414, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8608987927436829, + "num_tokens": 557640873.0, + "step": 14622 + }, + { + "epoch": 1.8601959038290294, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.671756744384766, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.862980842590332, + "num_tokens": 557684233.0, + "step": 14623 + }, + { + "epoch": 1.86032311410762, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.508750915527344, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8749666213989258, + "num_tokens": 557716895.0, + "step": 14624 + }, + { + "epoch": 1.8604503243862105, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.588611602783203, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8653719425201416, + "num_tokens": 557758197.0, + "step": 14625 + }, + { + "epoch": 1.860577534664801, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61454200744629, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8430230617523193, + "num_tokens": 557794196.0, + "step": 14626 + }, + { + "epoch": 1.8607047449433916, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.43947982788086, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8533568978309631, + "num_tokens": 557833856.0, + "step": 14627 + }, + { + "epoch": 1.860831955221982, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7049560546875, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8579312562942505, + "num_tokens": 557866304.0, + "step": 14628 + }, + { + "epoch": 1.8609591655005726, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.376771926879883, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8823311924934387, + "num_tokens": 557907065.0, + "step": 14629 + }, + { + "epoch": 1.861086375779163, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62266731262207, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8648613691329956, + "num_tokens": 557947326.0, + "step": 14630 + }, + { + "epoch": 1.8612135860577534, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.469091415405273, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8674492835998535, + "num_tokens": 557990059.0, + "step": 14631 + }, + { + "epoch": 1.861340796336344, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.458498001098633, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8711968660354614, + "num_tokens": 558028406.0, + "step": 14632 + }, + { + "epoch": 1.8614680066149345, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.723140716552734, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8649232387542725, + "num_tokens": 558070239.0, + "step": 14633 + }, + { + "epoch": 1.861595216893525, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5100040435791, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8726319670677185, + "num_tokens": 558107685.0, + "step": 14634 + }, + { + "epoch": 1.8617224271721156, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.687026977539062, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8639724254608154, + "num_tokens": 558143185.0, + "step": 14635 + }, + { + "epoch": 1.8618496374507059, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63846206665039, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8774397969245911, + "num_tokens": 558176207.0, + "step": 14636 + }, + { + "epoch": 1.8619768477292964, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6988525390625, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8847712874412537, + "num_tokens": 558212215.0, + "step": 14637 + }, + { + "epoch": 1.862104058007887, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.483718872070312, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8797305822372437, + "num_tokens": 558248471.0, + "step": 14638 + }, + { + "epoch": 1.8622312682864774, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71816635131836, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8562055826187134, + "num_tokens": 558285211.0, + "step": 14639 + }, + { + "epoch": 1.862358478565068, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.566139221191406, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.862258791923523, + "num_tokens": 558314434.0, + "step": 14640 + }, + { + "epoch": 1.8624856888436585, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.709667205810547, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8767304420471191, + "num_tokens": 558353710.0, + "step": 14641 + }, + { + "epoch": 1.862612899122249, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63332176208496, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8565194010734558, + "num_tokens": 558390476.0, + "step": 14642 + }, + { + "epoch": 1.8627401094008396, + "ewc_loss": 0.03466796875, + "ewc_loss_parallel": 3.457069396972656e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.501239776611328, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8778541088104248, + "num_tokens": 558424917.0, + "step": 14643 + }, + { + "epoch": 1.86286731967943, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.872581481933594, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8629616498947144, + "num_tokens": 558459831.0, + "step": 14644 + }, + { + "epoch": 1.8629945299580206, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.506120681762695, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8847403526306152, + "num_tokens": 558498600.0, + "step": 14645 + }, + { + "epoch": 1.8631217402366111, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.572528839111328, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8600219488143921, + "num_tokens": 558537465.0, + "step": 14646 + }, + { + "epoch": 1.8632489505152017, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.563203811645508, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8887007236480713, + "num_tokens": 558575357.0, + "step": 14647 + }, + { + "epoch": 1.8633761607937922, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.615997314453125, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8658433556556702, + "num_tokens": 558619422.0, + "step": 14648 + }, + { + "epoch": 1.8635033710723827, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.522737503051758, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8607943058013916, + "num_tokens": 558657677.0, + "step": 14649 + }, + { + "epoch": 1.8636305813509733, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.606536865234375, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8774733543395996, + "num_tokens": 558692912.0, + "step": 14650 + }, + { + "epoch": 1.8637577916295638, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.438268661499023, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8808905482292175, + "num_tokens": 558733470.0, + "step": 14651 + }, + { + "epoch": 1.8638850019081543, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.629749298095703, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8647030591964722, + "num_tokens": 558771899.0, + "step": 14652 + }, + { + "epoch": 1.8640122121867448, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53234100341797, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8756891489028931, + "num_tokens": 558813002.0, + "step": 14653 + }, + { + "epoch": 1.8641394224653351, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.563154220581055, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8537182807922363, + "num_tokens": 558850827.0, + "step": 14654 + }, + { + "epoch": 1.8642666327439257, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5595703125, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8669078350067139, + "num_tokens": 558889003.0, + "step": 14655 + }, + { + "epoch": 1.8643938430225162, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.464210510253906, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8753800392150879, + "num_tokens": 558925849.0, + "step": 14656 + }, + { + "epoch": 1.8645210533011067, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.649057388305664, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.869196355342865, + "num_tokens": 558967471.0, + "step": 14657 + }, + { + "epoch": 1.8646482635796973, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.500185012817383, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8731557130813599, + "num_tokens": 559001938.0, + "step": 14658 + }, + { + "epoch": 1.8647754738582878, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.55541229248047, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8751345872879028, + "num_tokens": 559040515.0, + "step": 14659 + }, + { + "epoch": 1.864902684136878, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63577651977539, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8807080984115601, + "num_tokens": 559077702.0, + "step": 14660 + }, + { + "epoch": 1.8650298944154686, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.354019165039062, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8626916408538818, + "num_tokens": 559114741.0, + "step": 14661 + }, + { + "epoch": 1.8651571046940592, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.654909133911133, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8484147787094116, + "num_tokens": 559155844.0, + "step": 14662 + }, + { + "epoch": 1.8652843149726497, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48192596435547, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8716188669204712, + "num_tokens": 559194964.0, + "step": 14663 + }, + { + "epoch": 1.8654115252512402, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56515884399414, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8758648633956909, + "num_tokens": 559230083.0, + "step": 14664 + }, + { + "epoch": 1.8655387355298307, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59414291381836, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8800400495529175, + "num_tokens": 559268397.0, + "step": 14665 + }, + { + "epoch": 1.8656659458084213, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54111099243164, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8599669933319092, + "num_tokens": 559302024.0, + "step": 14666 + }, + { + "epoch": 1.8657931560870118, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.706865310668945, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8673664331436157, + "num_tokens": 559343192.0, + "step": 14667 + }, + { + "epoch": 1.8659203663656023, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56161117553711, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8922581672668457, + "num_tokens": 559380616.0, + "step": 14668 + }, + { + "epoch": 1.8660475766441929, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71299171447754, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.879568338394165, + "num_tokens": 559413621.0, + "step": 14669 + }, + { + "epoch": 1.8661747869227834, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.505388259887695, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8786412477493286, + "num_tokens": 559447274.0, + "step": 14670 + }, + { + "epoch": 1.866301997201374, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.46550750732422, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8786112070083618, + "num_tokens": 559484306.0, + "step": 14671 + }, + { + "epoch": 1.8664292074799644, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61739158630371, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8758314847946167, + "num_tokens": 559527126.0, + "step": 14672 + }, + { + "epoch": 1.866556417758555, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50330352783203, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8744061589241028, + "num_tokens": 559568342.0, + "step": 14673 + }, + { + "epoch": 1.8666836280371455, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.610546112060547, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8681807518005371, + "num_tokens": 559609597.0, + "step": 14674 + }, + { + "epoch": 1.866810838315736, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60787582397461, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8686057925224304, + "num_tokens": 559646392.0, + "step": 14675 + }, + { + "epoch": 1.8669380485943265, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.66973114013672, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8731144666671753, + "num_tokens": 559684836.0, + "step": 14676 + }, + { + "epoch": 1.867065258872917, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.36871910095215, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8745381832122803, + "num_tokens": 559727793.0, + "step": 14677 + }, + { + "epoch": 1.8671924691515076, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.859739303588867, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8588598966598511, + "num_tokens": 559768154.0, + "step": 14678 + }, + { + "epoch": 1.867319679430098, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.479412078857422, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8704109787940979, + "num_tokens": 559798732.0, + "step": 14679 + }, + { + "epoch": 1.8674468897086884, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60126495361328, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8692259788513184, + "num_tokens": 559842796.0, + "step": 14680 + }, + { + "epoch": 1.867574099987279, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.522546768188477, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8677942752838135, + "num_tokens": 559884198.0, + "step": 14681 + }, + { + "epoch": 1.8677013102658695, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.596250534057617, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8559054136276245, + "num_tokens": 559916749.0, + "step": 14682 + }, + { + "epoch": 1.86782852054446, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.675100326538086, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8655024170875549, + "num_tokens": 559958430.0, + "step": 14683 + }, + { + "epoch": 1.8679557308230506, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56145477294922, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8869892358779907, + "num_tokens": 559995720.0, + "step": 14684 + }, + { + "epoch": 1.8680829411016409, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.55766487121582, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8738551139831543, + "num_tokens": 560033304.0, + "step": 14685 + }, + { + "epoch": 1.8682101513802314, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.600740432739258, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8685280084609985, + "num_tokens": 560069879.0, + "step": 14686 + }, + { + "epoch": 1.868337361658822, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.454015731811523, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8687905669212341, + "num_tokens": 560106853.0, + "step": 14687 + }, + { + "epoch": 1.8684645719374124, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.556270599365234, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8713050484657288, + "num_tokens": 560144225.0, + "step": 14688 + }, + { + "epoch": 1.868591782216003, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.468088150024414, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8637661933898926, + "num_tokens": 560186520.0, + "step": 14689 + }, + { + "epoch": 1.8687189924945935, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.65985870361328, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8726469874382019, + "num_tokens": 560225458.0, + "step": 14690 + }, + { + "epoch": 1.868846202773184, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.653257369995117, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8751561641693115, + "num_tokens": 560263401.0, + "step": 14691 + }, + { + "epoch": 1.8689734130517746, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58158302307129, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8662079572677612, + "num_tokens": 560305026.0, + "step": 14692 + }, + { + "epoch": 1.869100623330365, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.573394775390625, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8382198810577393, + "num_tokens": 560354823.0, + "step": 14693 + }, + { + "epoch": 1.8692278336089556, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.614192962646484, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8844349384307861, + "num_tokens": 560391888.0, + "step": 14694 + }, + { + "epoch": 1.8693550438875461, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63060188293457, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8718761205673218, + "num_tokens": 560425350.0, + "step": 14695 + }, + { + "epoch": 1.8694822541661367, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.459918975830078, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8794825673103333, + "num_tokens": 560458521.0, + "step": 14696 + }, + { + "epoch": 1.8696094644447272, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.458290100097656, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8643940091133118, + "num_tokens": 560506378.0, + "step": 14697 + }, + { + "epoch": 1.8697366747233177, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.768083572387695, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8917728662490845, + "num_tokens": 560544055.0, + "step": 14698 + }, + { + "epoch": 1.8698638850019083, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.524938583374023, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8794339895248413, + "num_tokens": 560577240.0, + "step": 14699 + }, + { + "epoch": 1.8699910952804988, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51462745666504, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8649763464927673, + "num_tokens": 560615662.0, + "step": 14700 + }, + { + "epoch": 1.8701183055590893, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.736553192138672, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8652037382125854, + "num_tokens": 560658386.0, + "step": 14701 + }, + { + "epoch": 1.8702455158376798, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.39665985107422, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8673019409179688, + "num_tokens": 560696366.0, + "step": 14702 + }, + { + "epoch": 1.8703727261162701, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.712209701538086, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8588906526565552, + "num_tokens": 560736419.0, + "step": 14703 + }, + { + "epoch": 1.8704999363948607, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.398378372192383, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8687611818313599, + "num_tokens": 560777839.0, + "step": 14704 + }, + { + "epoch": 1.8706271466734512, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53301239013672, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8801803588867188, + "num_tokens": 560816167.0, + "step": 14705 + }, + { + "epoch": 1.8707543569520417, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.708724975585938, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8560686707496643, + "num_tokens": 560854524.0, + "step": 14706 + }, + { + "epoch": 1.8708815672306323, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.493738174438477, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.867577075958252, + "num_tokens": 560894393.0, + "step": 14707 + }, + { + "epoch": 1.8710087775092228, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50287437438965, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8804168701171875, + "num_tokens": 560938640.0, + "step": 14708 + }, + { + "epoch": 1.871135987787813, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.637203216552734, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8695535063743591, + "num_tokens": 560976805.0, + "step": 14709 + }, + { + "epoch": 1.8712631980664036, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.472585678100586, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.867466390132904, + "num_tokens": 561018940.0, + "step": 14710 + }, + { + "epoch": 1.8713904083449941, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.702709197998047, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8747403621673584, + "num_tokens": 561065228.0, + "step": 14711 + }, + { + "epoch": 1.8715176186235847, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.682058334350586, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8618500232696533, + "num_tokens": 561107540.0, + "step": 14712 + }, + { + "epoch": 1.8716448289021752, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.607017517089844, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8505809307098389, + "num_tokens": 561144774.0, + "step": 14713 + }, + { + "epoch": 1.8717720391807657, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56396484375, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8502074480056763, + "num_tokens": 561186774.0, + "step": 14714 + }, + { + "epoch": 1.8718992494593563, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.595312118530273, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8833571672439575, + "num_tokens": 561223997.0, + "step": 14715 + }, + { + "epoch": 1.8720264597379468, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71285629272461, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8775383234024048, + "num_tokens": 561261583.0, + "step": 14716 + }, + { + "epoch": 1.8721536700165373, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.320064544677734, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8751039505004883, + "num_tokens": 561302681.0, + "step": 14717 + }, + { + "epoch": 1.8722808802951278, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.640226364135742, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8682758212089539, + "num_tokens": 561338402.0, + "step": 14718 + }, + { + "epoch": 1.8724080905737184, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75687026977539, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8736480474472046, + "num_tokens": 561372892.0, + "step": 14719 + }, + { + "epoch": 1.872535300852309, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.445758819580078, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8669085502624512, + "num_tokens": 561409474.0, + "step": 14720 + }, + { + "epoch": 1.8726625111308994, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53737449645996, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8439658284187317, + "num_tokens": 561450152.0, + "step": 14721 + }, + { + "epoch": 1.87278972140949, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.653974533081055, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8632668256759644, + "num_tokens": 561489448.0, + "step": 14722 + }, + { + "epoch": 1.8729169316880805, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.33005714416504, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8470827341079712, + "num_tokens": 561529684.0, + "step": 14723 + }, + { + "epoch": 1.873044141966671, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.038206100463867, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8705265522003174, + "num_tokens": 561564259.0, + "step": 14724 + }, + { + "epoch": 1.8731713522452615, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.496389389038086, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8776057958602905, + "num_tokens": 561609650.0, + "step": 14725 + }, + { + "epoch": 1.873298562523852, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.49220085144043, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8757375478744507, + "num_tokens": 561645520.0, + "step": 14726 + }, + { + "epoch": 1.8734257728024426, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51483154296875, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8749249577522278, + "num_tokens": 561682771.0, + "step": 14727 + }, + { + "epoch": 1.873552983081033, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7891902923584, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8594470620155334, + "num_tokens": 561721317.0, + "step": 14728 + }, + { + "epoch": 1.8736801933596234, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.591428756713867, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8692260980606079, + "num_tokens": 561763178.0, + "step": 14729 + }, + { + "epoch": 1.873807403638214, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.725299835205078, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8705066442489624, + "num_tokens": 561802292.0, + "step": 14730 + }, + { + "epoch": 1.8739346139168045, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.764915466308594, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8780420422554016, + "num_tokens": 561838610.0, + "step": 14731 + }, + { + "epoch": 1.874061824195395, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.729246139526367, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8735918998718262, + "num_tokens": 561881992.0, + "step": 14732 + }, + { + "epoch": 1.8741890344739855, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.558488845825195, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8596840500831604, + "num_tokens": 561917661.0, + "step": 14733 + }, + { + "epoch": 1.8743162447525759, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.767044067382812, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8710508942604065, + "num_tokens": 561958218.0, + "step": 14734 + }, + { + "epoch": 1.8744434550311664, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.870864868164062, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.881462574005127, + "num_tokens": 561993882.0, + "step": 14735 + }, + { + "epoch": 1.874570665309757, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.642107009887695, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8713889718055725, + "num_tokens": 562035443.0, + "step": 14736 + }, + { + "epoch": 1.8746978755883474, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.535539627075195, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8683747053146362, + "num_tokens": 562072806.0, + "step": 14737 + }, + { + "epoch": 1.874825085866938, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68039321899414, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8542189598083496, + "num_tokens": 562113652.0, + "step": 14738 + }, + { + "epoch": 1.8749522961455285, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.977554321289062, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8612363338470459, + "num_tokens": 562150600.0, + "step": 14739 + }, + { + "epoch": 1.875079506424119, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64203643798828, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8718234300613403, + "num_tokens": 562187730.0, + "step": 14740 + }, + { + "epoch": 1.8752067167027096, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6782283782959, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.870492696762085, + "num_tokens": 562228229.0, + "step": 14741 + }, + { + "epoch": 1.8753339269813, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.649938583374023, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8753845691680908, + "num_tokens": 562262293.0, + "step": 14742 + }, + { + "epoch": 1.8754611372598906, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.788623809814453, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8667168617248535, + "num_tokens": 562303389.0, + "step": 14743 + }, + { + "epoch": 1.8755883475384811, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87464141845703, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8631637096405029, + "num_tokens": 562344461.0, + "step": 14744 + }, + { + "epoch": 1.8757155578170717, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.642091751098633, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8516934514045715, + "num_tokens": 562387068.0, + "step": 14745 + }, + { + "epoch": 1.8758427680956622, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.687578201293945, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8677181005477905, + "num_tokens": 562428225.0, + "step": 14746 + }, + { + "epoch": 1.8759699783742527, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.737167358398438, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8694784045219421, + "num_tokens": 562463879.0, + "step": 14747 + }, + { + "epoch": 1.8760971886528433, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.637393951416016, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8628614544868469, + "num_tokens": 562502327.0, + "step": 14748 + }, + { + "epoch": 1.8762243989314338, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.960739135742188, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8736612200737, + "num_tokens": 562542162.0, + "step": 14749 + }, + { + "epoch": 1.8763516092100243, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23026466369629, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8640385866165161, + "num_tokens": 562583364.0, + "step": 14750 + }, + { + "epoch": 1.8764788194886148, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.357290267944336, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.87138432264328, + "num_tokens": 562615870.0, + "step": 14751 + }, + { + "epoch": 1.8766060297672051, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.717866897583008, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8537499308586121, + "num_tokens": 562655086.0, + "step": 14752 + }, + { + "epoch": 1.8767332400457957, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.823776245117188, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8628277778625488, + "num_tokens": 562697532.0, + "step": 14753 + }, + { + "epoch": 1.8768604503243862, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.410799026489258, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8496784567832947, + "num_tokens": 562731057.0, + "step": 14754 + }, + { + "epoch": 1.8769876606029767, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.918790817260742, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8576208353042603, + "num_tokens": 562770402.0, + "step": 14755 + }, + { + "epoch": 1.8771148708815673, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.961896896362305, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8655534982681274, + "num_tokens": 562806191.0, + "step": 14756 + }, + { + "epoch": 1.8772420811601578, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.455467224121094, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8675445914268494, + "num_tokens": 562846864.0, + "step": 14757 + }, + { + "epoch": 1.877369291438748, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.750709533691406, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8637243509292603, + "num_tokens": 562882887.0, + "step": 14758 + }, + { + "epoch": 1.8774965017173386, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.728498458862305, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8689461946487427, + "num_tokens": 562919096.0, + "step": 14759 + }, + { + "epoch": 1.8776237119959291, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.700828552246094, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8426222205162048, + "num_tokens": 562959336.0, + "step": 14760 + }, + { + "epoch": 1.8777509222745197, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.430021286010742, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8677859902381897, + "num_tokens": 562997883.0, + "step": 14761 + }, + { + "epoch": 1.8778781325531102, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84940528869629, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8780134916305542, + "num_tokens": 563036763.0, + "step": 14762 + }, + { + "epoch": 1.8780053428317007, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.005321502685547, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8852890133857727, + "num_tokens": 563076815.0, + "step": 14763 + }, + { + "epoch": 1.8781325531102913, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.506860733032227, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8794476985931396, + "num_tokens": 563110511.0, + "step": 14764 + }, + { + "epoch": 1.8782597633888818, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62337303161621, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8637266159057617, + "num_tokens": 563140757.0, + "step": 14765 + }, + { + "epoch": 1.8783869736674723, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82948875427246, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8698698878288269, + "num_tokens": 563179961.0, + "step": 14766 + }, + { + "epoch": 1.8785141839460628, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62293815612793, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8758992552757263, + "num_tokens": 563221355.0, + "step": 14767 + }, + { + "epoch": 1.8786413942246534, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.57750701904297, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8898262977600098, + "num_tokens": 563257427.0, + "step": 14768 + }, + { + "epoch": 1.878768604503244, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.812713623046875, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8656338453292847, + "num_tokens": 563294371.0, + "step": 14769 + }, + { + "epoch": 1.8788958147818344, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.616317749023438, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8793481588363647, + "num_tokens": 563338592.0, + "step": 14770 + }, + { + "epoch": 1.879023025060425, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.707820892333984, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8728655576705933, + "num_tokens": 563373853.0, + "step": 14771 + }, + { + "epoch": 1.8791502353390155, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.744596481323242, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8688260316848755, + "num_tokens": 563411137.0, + "step": 14772 + }, + { + "epoch": 1.879277445617606, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63735580444336, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8729563355445862, + "num_tokens": 563450294.0, + "step": 14773 + }, + { + "epoch": 1.8794046558961965, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62436294555664, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8710750937461853, + "num_tokens": 563496858.0, + "step": 14774 + }, + { + "epoch": 1.879531866174787, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848953247070312, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8645615577697754, + "num_tokens": 563539566.0, + "step": 14775 + }, + { + "epoch": 1.8796590764533776, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.651865005493164, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.876997172832489, + "num_tokens": 563574363.0, + "step": 14776 + }, + { + "epoch": 1.879786286731968, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.4770565032959, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8702852129936218, + "num_tokens": 563605540.0, + "step": 14777 + }, + { + "epoch": 1.8799134970105584, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.871442794799805, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8554474115371704, + "num_tokens": 563646640.0, + "step": 14778 + }, + { + "epoch": 1.880040707289149, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.52417755126953, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8797620534896851, + "num_tokens": 563683576.0, + "step": 14779 + }, + { + "epoch": 1.8801679175677395, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.537189483642578, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8615196347236633, + "num_tokens": 563724088.0, + "step": 14780 + }, + { + "epoch": 1.88029512784633, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.776784896850586, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8682641983032227, + "num_tokens": 563760897.0, + "step": 14781 + }, + { + "epoch": 1.8804223381249205, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60032081604004, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8744800090789795, + "num_tokens": 563800158.0, + "step": 14782 + }, + { + "epoch": 1.8805495484035109, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.709505081176758, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8744938373565674, + "num_tokens": 563833288.0, + "step": 14783 + }, + { + "epoch": 1.8806767586821014, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.52446937561035, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8810029029846191, + "num_tokens": 563874507.0, + "step": 14784 + }, + { + "epoch": 1.880803968960692, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.467927932739258, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8688699007034302, + "num_tokens": 563912452.0, + "step": 14785 + }, + { + "epoch": 1.8809311792392824, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82435417175293, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8518564701080322, + "num_tokens": 563950565.0, + "step": 14786 + }, + { + "epoch": 1.881058389517873, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.729808807373047, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8605004549026489, + "num_tokens": 563984924.0, + "step": 14787 + }, + { + "epoch": 1.8811855997964635, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.677017211914062, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8763077259063721, + "num_tokens": 564022887.0, + "step": 14788 + }, + { + "epoch": 1.881312810075054, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.509235382080078, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.868781328201294, + "num_tokens": 564064829.0, + "step": 14789 + }, + { + "epoch": 1.8814400203536445, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89632797241211, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.847889244556427, + "num_tokens": 564100858.0, + "step": 14790 + }, + { + "epoch": 1.881567230632235, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.611223220825195, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8619067072868347, + "num_tokens": 564141477.0, + "step": 14791 + }, + { + "epoch": 1.8816944409108256, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.903865814208984, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8623827695846558, + "num_tokens": 564179271.0, + "step": 14792 + }, + { + "epoch": 1.8818216511894161, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61722183227539, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.875560462474823, + "num_tokens": 564224336.0, + "step": 14793 + }, + { + "epoch": 1.8819488614680067, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71599006652832, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8636812567710876, + "num_tokens": 564265157.0, + "step": 14794 + }, + { + "epoch": 1.8820760717465972, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.704490661621094, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8749434947967529, + "num_tokens": 564295023.0, + "step": 14795 + }, + { + "epoch": 1.8822032820251877, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.686920166015625, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.870582103729248, + "num_tokens": 564333700.0, + "step": 14796 + }, + { + "epoch": 1.8823304923037782, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.902149200439453, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8555926084518433, + "num_tokens": 564374342.0, + "step": 14797 + }, + { + "epoch": 1.8824577025823688, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.711658477783203, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8719865083694458, + "num_tokens": 564422877.0, + "step": 14798 + }, + { + "epoch": 1.8825849128609593, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.632375717163086, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8693555593490601, + "num_tokens": 564464713.0, + "step": 14799 + }, + { + "epoch": 1.8827121231395498, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.694074630737305, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8586971759796143, + "num_tokens": 564500667.0, + "step": 14800 + }, + { + "epoch": 1.8828393334181401, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63254165649414, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8774879574775696, + "num_tokens": 564541460.0, + "step": 14801 + }, + { + "epoch": 1.8829665436967307, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.926889419555664, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8612833619117737, + "num_tokens": 564580983.0, + "step": 14802 + }, + { + "epoch": 1.8830937539753212, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.546855926513672, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8783934712409973, + "num_tokens": 564615967.0, + "step": 14803 + }, + { + "epoch": 1.8832209642539117, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59786605834961, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8611543774604797, + "num_tokens": 564653255.0, + "step": 14804 + }, + { + "epoch": 1.8833481745325023, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6865291595459, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.867074191570282, + "num_tokens": 564688420.0, + "step": 14805 + }, + { + "epoch": 1.8834753848110928, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.529111862182617, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8627711534500122, + "num_tokens": 564733123.0, + "step": 14806 + }, + { + "epoch": 1.883602595089683, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.874448776245117, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8762471079826355, + "num_tokens": 564767712.0, + "step": 14807 + }, + { + "epoch": 1.8837298053682736, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.635419845581055, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8665851950645447, + "num_tokens": 564810663.0, + "step": 14808 + }, + { + "epoch": 1.8838570156468641, + "ewc_loss": 0.034912109375, + "ewc_loss_parallel": 3.4809112548828125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56396484375, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8674092888832092, + "num_tokens": 564848689.0, + "step": 14809 + }, + { + "epoch": 1.8839842259254547, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.912242889404297, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8571174144744873, + "num_tokens": 564887426.0, + "step": 14810 + }, + { + "epoch": 1.8841114362040452, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.545368194580078, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8733939528465271, + "num_tokens": 564924852.0, + "step": 14811 + }, + { + "epoch": 1.8842386464826357, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.641666412353516, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8654743432998657, + "num_tokens": 564964201.0, + "step": 14812 + }, + { + "epoch": 1.8843658567612263, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86158561706543, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.880544900894165, + "num_tokens": 565004939.0, + "step": 14813 + }, + { + "epoch": 1.8844930670398168, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.375516891479492, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8740946054458618, + "num_tokens": 565044067.0, + "step": 14814 + }, + { + "epoch": 1.8846202773184073, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.676176071166992, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8709261417388916, + "num_tokens": 565082114.0, + "step": 14815 + }, + { + "epoch": 1.8847474875969978, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.758962631225586, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8690230846405029, + "num_tokens": 565118350.0, + "step": 14816 + }, + { + "epoch": 1.8848746978755884, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.585491180419922, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8621982336044312, + "num_tokens": 565153313.0, + "step": 14817 + }, + { + "epoch": 1.885001908154179, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.51512336730957, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8572056889533997, + "num_tokens": 565191332.0, + "step": 14818 + }, + { + "epoch": 1.8851291184327694, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.922409057617188, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8657189607620239, + "num_tokens": 565229937.0, + "step": 14819 + }, + { + "epoch": 1.88525632871136, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.628822326660156, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8637412786483765, + "num_tokens": 565266271.0, + "step": 14820 + }, + { + "epoch": 1.8853835389899505, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.750804901123047, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8658367991447449, + "num_tokens": 565300593.0, + "step": 14821 + }, + { + "epoch": 1.885510749268541, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63259506225586, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8683106303215027, + "num_tokens": 565341801.0, + "step": 14822 + }, + { + "epoch": 1.8856379595471315, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74587059020996, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.887161135673523, + "num_tokens": 565374380.0, + "step": 14823 + }, + { + "epoch": 1.885765169825722, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.694103240966797, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8609247207641602, + "num_tokens": 565413096.0, + "step": 14824 + }, + { + "epoch": 1.8858923801043126, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.711360931396484, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.878017246723175, + "num_tokens": 565453292.0, + "step": 14825 + }, + { + "epoch": 1.886019590382903, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.792375564575195, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8638259172439575, + "num_tokens": 565489311.0, + "step": 14826 + }, + { + "epoch": 1.8861468006614934, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.706789016723633, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8526250123977661, + "num_tokens": 565526293.0, + "step": 14827 + }, + { + "epoch": 1.886274010940084, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.535799026489258, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8618496656417847, + "num_tokens": 565569215.0, + "step": 14828 + }, + { + "epoch": 1.8864012212186745, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.542760848999023, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8793237209320068, + "num_tokens": 565610755.0, + "step": 14829 + }, + { + "epoch": 1.886528431497265, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.665306091308594, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8745481967926025, + "num_tokens": 565641964.0, + "step": 14830 + }, + { + "epoch": 1.8866556417758555, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.598857879638672, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8355166912078857, + "num_tokens": 565682852.0, + "step": 14831 + }, + { + "epoch": 1.8867828520544458, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.521329879760742, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8847556710243225, + "num_tokens": 565721352.0, + "step": 14832 + }, + { + "epoch": 1.8869100623330364, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54192352294922, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8732762336730957, + "num_tokens": 565764045.0, + "step": 14833 + }, + { + "epoch": 1.887037272611627, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.402034759521484, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8771605491638184, + "num_tokens": 565797617.0, + "step": 14834 + }, + { + "epoch": 1.8871644828902174, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.53463363647461, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8683280944824219, + "num_tokens": 565839254.0, + "step": 14835 + }, + { + "epoch": 1.887291693168808, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.636770248413086, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8683084845542908, + "num_tokens": 565874742.0, + "step": 14836 + }, + { + "epoch": 1.8874189034473985, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.478763580322266, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8694734573364258, + "num_tokens": 565911043.0, + "step": 14837 + }, + { + "epoch": 1.887546113725989, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.681833267211914, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8650999665260315, + "num_tokens": 565950228.0, + "step": 14838 + }, + { + "epoch": 1.8876733240045795, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.640886306762695, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8803087472915649, + "num_tokens": 565982891.0, + "step": 14839 + }, + { + "epoch": 1.88780053428317, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6711368560791, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8673927783966064, + "num_tokens": 566022601.0, + "step": 14840 + }, + { + "epoch": 1.8879277445617606, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.640615463256836, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8703739047050476, + "num_tokens": 566054928.0, + "step": 14841 + }, + { + "epoch": 1.8880549548403511, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.753061294555664, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8622618913650513, + "num_tokens": 566094900.0, + "step": 14842 + }, + { + "epoch": 1.8881821651189417, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.853866577148438, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8668454885482788, + "num_tokens": 566125230.0, + "step": 14843 + }, + { + "epoch": 1.8883093753975322, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.633930206298828, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8839457035064697, + "num_tokens": 566165320.0, + "step": 14844 + }, + { + "epoch": 1.8884365856761227, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83133316040039, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8594530820846558, + "num_tokens": 566202769.0, + "step": 14845 + }, + { + "epoch": 1.8885637959547132, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.732608795166016, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8705288171768188, + "num_tokens": 566245874.0, + "step": 14846 + }, + { + "epoch": 1.8886910062333038, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.677309036254883, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8506983518600464, + "num_tokens": 566284855.0, + "step": 14847 + }, + { + "epoch": 1.8888182165118943, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.627887725830078, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8604069352149963, + "num_tokens": 566323422.0, + "step": 14848 + }, + { + "epoch": 1.8889454267904848, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6514835357666, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8772904872894287, + "num_tokens": 566364357.0, + "step": 14849 + }, + { + "epoch": 1.8890726370690751, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.704792022705078, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8661850690841675, + "num_tokens": 566404498.0, + "step": 14850 + }, + { + "epoch": 1.8891998473476657, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7210636138916, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8650792837142944, + "num_tokens": 566442960.0, + "step": 14851 + }, + { + "epoch": 1.8893270576262562, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.814727783203125, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8783656358718872, + "num_tokens": 566475087.0, + "step": 14852 + }, + { + "epoch": 1.8894542679048467, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.656030654907227, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8589690923690796, + "num_tokens": 566514337.0, + "step": 14853 + }, + { + "epoch": 1.8895814781834372, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83142852783203, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.859551191329956, + "num_tokens": 566553204.0, + "step": 14854 + }, + { + "epoch": 1.8897086884620278, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.496612548828125, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8664717674255371, + "num_tokens": 566589697.0, + "step": 14855 + }, + { + "epoch": 1.889835898740618, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7852725982666, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8543423414230347, + "num_tokens": 566629680.0, + "step": 14856 + }, + { + "epoch": 1.8899631090192086, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77642059326172, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8913016319274902, + "num_tokens": 566670931.0, + "step": 14857 + }, + { + "epoch": 1.8900903192977991, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.625572204589844, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8636175394058228, + "num_tokens": 566706673.0, + "step": 14858 + }, + { + "epoch": 1.8902175295763897, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.744384765625, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.864788293838501, + "num_tokens": 566749503.0, + "step": 14859 + }, + { + "epoch": 1.8903447398549802, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.775672912597656, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.883131742477417, + "num_tokens": 566790700.0, + "step": 14860 + }, + { + "epoch": 1.8904719501335707, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.570960998535156, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8676576614379883, + "num_tokens": 566832995.0, + "step": 14861 + }, + { + "epoch": 1.8905991604121613, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64984130859375, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8565896153450012, + "num_tokens": 566868190.0, + "step": 14862 + }, + { + "epoch": 1.8907263706907518, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.723419189453125, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8408938646316528, + "num_tokens": 566906637.0, + "step": 14863 + }, + { + "epoch": 1.8908535809693423, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89586639404297, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8689774870872498, + "num_tokens": 566946006.0, + "step": 14864 + }, + { + "epoch": 1.8909807912479328, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74108123779297, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8566944599151611, + "num_tokens": 566987726.0, + "step": 14865 + }, + { + "epoch": 1.8911080015265234, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69174575805664, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8839294910430908, + "num_tokens": 567027432.0, + "step": 14866 + }, + { + "epoch": 1.891235211805114, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6021785736084, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8757177591323853, + "num_tokens": 567060607.0, + "step": 14867 + }, + { + "epoch": 1.8913624220837044, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83861541748047, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8622072339057922, + "num_tokens": 567100691.0, + "step": 14868 + }, + { + "epoch": 1.891489632362295, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6267147064209, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.887540340423584, + "num_tokens": 567139460.0, + "step": 14869 + }, + { + "epoch": 1.8916168426408855, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.433223724365234, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.85686856508255, + "num_tokens": 567185384.0, + "step": 14870 + }, + { + "epoch": 1.891744052919476, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80429458618164, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8681067228317261, + "num_tokens": 567219898.0, + "step": 14871 + }, + { + "epoch": 1.8918712631980665, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.550722122192383, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8756060600280762, + "num_tokens": 567260322.0, + "step": 14872 + }, + { + "epoch": 1.891998473476657, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.628442764282227, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8643447160720825, + "num_tokens": 567293739.0, + "step": 14873 + }, + { + "epoch": 1.8921256837552476, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.790069580078125, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8701212406158447, + "num_tokens": 567327282.0, + "step": 14874 + }, + { + "epoch": 1.892252894033838, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.661123275756836, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8624266982078552, + "num_tokens": 567366402.0, + "step": 14875 + }, + { + "epoch": 1.8923801043124284, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.659637451171875, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8614513278007507, + "num_tokens": 567409924.0, + "step": 14876 + }, + { + "epoch": 1.892507314591019, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.837461471557617, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8711079359054565, + "num_tokens": 567450355.0, + "step": 14877 + }, + { + "epoch": 1.8926345248696095, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.641231536865234, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8823769092559814, + "num_tokens": 567487701.0, + "step": 14878 + }, + { + "epoch": 1.8927617351482, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.912029266357422, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8678237199783325, + "num_tokens": 567527619.0, + "step": 14879 + }, + { + "epoch": 1.8928889454267905, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.580698013305664, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8678337335586548, + "num_tokens": 567567016.0, + "step": 14880 + }, + { + "epoch": 1.8930161557053808, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.004968643188477, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8561743497848511, + "num_tokens": 567605465.0, + "step": 14881 + }, + { + "epoch": 1.8931433659839714, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.923612594604492, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8718898296356201, + "num_tokens": 567646680.0, + "step": 14882 + }, + { + "epoch": 1.893270576262562, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67070198059082, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.869794487953186, + "num_tokens": 567685685.0, + "step": 14883 + }, + { + "epoch": 1.8933977865411524, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.730018615722656, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8587059378623962, + "num_tokens": 567717222.0, + "step": 14884 + }, + { + "epoch": 1.893524996819743, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02254295349121, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8590752482414246, + "num_tokens": 567761050.0, + "step": 14885 + }, + { + "epoch": 1.8936522070983335, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.682384490966797, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8668943643569946, + "num_tokens": 567801423.0, + "step": 14886 + }, + { + "epoch": 1.893779417376924, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09982681274414, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8592365384101868, + "num_tokens": 567844614.0, + "step": 14887 + }, + { + "epoch": 1.8939066276555145, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.11029624938965, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8716021776199341, + "num_tokens": 567878803.0, + "step": 14888 + }, + { + "epoch": 1.894033837934105, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.644800186157227, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.880973756313324, + "num_tokens": 567909419.0, + "step": 14889 + }, + { + "epoch": 1.8941610482126956, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.008525848388672, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8698323965072632, + "num_tokens": 567949221.0, + "step": 14890 + }, + { + "epoch": 1.8942882584912861, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68347930908203, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8850035667419434, + "num_tokens": 567985101.0, + "step": 14891 + }, + { + "epoch": 1.8944154687698767, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60878562927246, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8519647121429443, + "num_tokens": 568030542.0, + "step": 14892 + }, + { + "epoch": 1.8945426790484672, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.160236358642578, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8719286918640137, + "num_tokens": 568070723.0, + "step": 14893 + }, + { + "epoch": 1.8946698893270577, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76296043395996, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8609793782234192, + "num_tokens": 568113444.0, + "step": 14894 + }, + { + "epoch": 1.8947970996056482, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.66571044921875, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.871617317199707, + "num_tokens": 568152646.0, + "step": 14895 + }, + { + "epoch": 1.8949243098842388, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.816795349121094, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8690767288208008, + "num_tokens": 568187046.0, + "step": 14896 + }, + { + "epoch": 1.8950515201628293, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.943023681640625, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8647561073303223, + "num_tokens": 568225052.0, + "step": 14897 + }, + { + "epoch": 1.8951787304414198, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.757164001464844, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8603676557540894, + "num_tokens": 568258299.0, + "step": 14898 + }, + { + "epoch": 1.8953059407200101, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88150405883789, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8747893571853638, + "num_tokens": 568297137.0, + "step": 14899 + }, + { + "epoch": 1.8954331509986007, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.954648971557617, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8902804255485535, + "num_tokens": 568331102.0, + "step": 14900 + }, + { + "epoch": 1.8955603612771912, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6778621673584, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.866692066192627, + "num_tokens": 568373926.0, + "step": 14901 + }, + { + "epoch": 1.8956875715557817, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.01206398010254, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8771085739135742, + "num_tokens": 568406470.0, + "step": 14902 + }, + { + "epoch": 1.8958147818343722, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.618703842163086, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8756104111671448, + "num_tokens": 568443452.0, + "step": 14903 + }, + { + "epoch": 1.8959419921129628, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.57970428466797, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8636883497238159, + "num_tokens": 568480085.0, + "step": 14904 + }, + { + "epoch": 1.896069202391553, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58843421936035, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8654810786247253, + "num_tokens": 568519191.0, + "step": 14905 + }, + { + "epoch": 1.8961964126701436, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05755615234375, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8750328421592712, + "num_tokens": 568554224.0, + "step": 14906 + }, + { + "epoch": 1.8963236229487341, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.101730346679688, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.885922908782959, + "num_tokens": 568586102.0, + "step": 14907 + }, + { + "epoch": 1.8964508332273247, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.522930145263672, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.863965630531311, + "num_tokens": 568625224.0, + "step": 14908 + }, + { + "epoch": 1.8965780435059152, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.781827926635742, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8474893569946289, + "num_tokens": 568665588.0, + "step": 14909 + }, + { + "epoch": 1.8967052537845057, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.785667419433594, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8844189643859863, + "num_tokens": 568698408.0, + "step": 14910 + }, + { + "epoch": 1.8968324640630962, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.550493240356445, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8568459749221802, + "num_tokens": 568731982.0, + "step": 14911 + }, + { + "epoch": 1.8969596743416868, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.954912185668945, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8674727082252502, + "num_tokens": 568769388.0, + "step": 14912 + }, + { + "epoch": 1.8970868846202773, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.569395065307617, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8757859468460083, + "num_tokens": 568810329.0, + "step": 14913 + }, + { + "epoch": 1.8972140948988678, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90428352355957, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8735122680664062, + "num_tokens": 568852241.0, + "step": 14914 + }, + { + "epoch": 1.8973413051774584, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.768177032470703, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8642258644104004, + "num_tokens": 568889660.0, + "step": 14915 + }, + { + "epoch": 1.8974685154560489, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.709985733032227, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8662830591201782, + "num_tokens": 568934489.0, + "step": 14916 + }, + { + "epoch": 1.8975957257346394, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.990285873413086, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8898086547851562, + "num_tokens": 568971904.0, + "step": 14917 + }, + { + "epoch": 1.89772293601323, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.767183303833008, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8499447107315063, + "num_tokens": 569011863.0, + "step": 14918 + }, + { + "epoch": 1.8978501462918205, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.70932960510254, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8898525238037109, + "num_tokens": 569050295.0, + "step": 14919 + }, + { + "epoch": 1.897977356570411, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16306495666504, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8623903393745422, + "num_tokens": 569087013.0, + "step": 14920 + }, + { + "epoch": 1.8981045668490015, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.609760284423828, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8626793622970581, + "num_tokens": 569125226.0, + "step": 14921 + }, + { + "epoch": 1.898231777127592, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.814668655395508, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8656948804855347, + "num_tokens": 569161910.0, + "step": 14922 + }, + { + "epoch": 1.8983589874061826, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.849430084228516, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8707248568534851, + "num_tokens": 569196519.0, + "step": 14923 + }, + { + "epoch": 1.898486197684773, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.507694244384766, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8650623559951782, + "num_tokens": 569234404.0, + "step": 14924 + }, + { + "epoch": 1.8986134079633634, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62691307067871, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8602168560028076, + "num_tokens": 569272728.0, + "step": 14925 + }, + { + "epoch": 1.898740618241954, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.695213317871094, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8600915670394897, + "num_tokens": 569315434.0, + "step": 14926 + }, + { + "epoch": 1.8988678285205445, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.729156494140625, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8523640036582947, + "num_tokens": 569357473.0, + "step": 14927 + }, + { + "epoch": 1.898995038799135, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.721620559692383, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8561395406723022, + "num_tokens": 569398747.0, + "step": 14928 + }, + { + "epoch": 1.8991222490777255, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.647342681884766, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.87593013048172, + "num_tokens": 569434855.0, + "step": 14929 + }, + { + "epoch": 1.8992494593563158, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.782501220703125, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8681113719940186, + "num_tokens": 569475236.0, + "step": 14930 + }, + { + "epoch": 1.8993766696349064, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.628944396972656, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8677595853805542, + "num_tokens": 569513599.0, + "step": 14931 + }, + { + "epoch": 1.899503879913497, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.607194900512695, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8756155967712402, + "num_tokens": 569559059.0, + "step": 14932 + }, + { + "epoch": 1.8996310901920874, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56982421875, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8859943747520447, + "num_tokens": 569595240.0, + "step": 14933 + }, + { + "epoch": 1.899758300470678, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.65460968017578, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8584451079368591, + "num_tokens": 569629336.0, + "step": 14934 + }, + { + "epoch": 1.8998855107492685, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.660276412963867, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8790994882583618, + "num_tokens": 569663613.0, + "step": 14935 + }, + { + "epoch": 1.900012721027859, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.732070922851562, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8528276681900024, + "num_tokens": 569702884.0, + "step": 14936 + }, + { + "epoch": 1.9001399313064495, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.656389236450195, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8829445242881775, + "num_tokens": 569744618.0, + "step": 14937 + }, + { + "epoch": 1.90026714158504, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.617130279541016, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8772984743118286, + "num_tokens": 569777083.0, + "step": 14938 + }, + { + "epoch": 1.9003943518636306, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77569007873535, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8759745955467224, + "num_tokens": 569809652.0, + "step": 14939 + }, + { + "epoch": 1.9005215621422211, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.659027099609375, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8734562993049622, + "num_tokens": 569846465.0, + "step": 14940 + }, + { + "epoch": 1.9006487724208116, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.606094360351562, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8612081408500671, + "num_tokens": 569883655.0, + "step": 14941 + }, + { + "epoch": 1.9007759826994022, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78317642211914, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8665827512741089, + "num_tokens": 569923358.0, + "step": 14942 + }, + { + "epoch": 1.9009031929779927, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.567461013793945, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8647478818893433, + "num_tokens": 569961393.0, + "step": 14943 + }, + { + "epoch": 1.9010304032565832, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60867691040039, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.842029333114624, + "num_tokens": 570002092.0, + "step": 14944 + }, + { + "epoch": 1.9011576135351738, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.983985900878906, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8541956543922424, + "num_tokens": 570037699.0, + "step": 14945 + }, + { + "epoch": 1.9012848238137643, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71629524230957, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8459352254867554, + "num_tokens": 570077693.0, + "step": 14946 + }, + { + "epoch": 1.9014120340923548, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.805482864379883, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8879286646842957, + "num_tokens": 570115354.0, + "step": 14947 + }, + { + "epoch": 1.9015392443709451, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.70193862915039, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8767968416213989, + "num_tokens": 570154387.0, + "step": 14948 + }, + { + "epoch": 1.9016664546495357, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.675674438476562, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8647992610931396, + "num_tokens": 570189743.0, + "step": 14949 + }, + { + "epoch": 1.9017936649281262, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60310935974121, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8802522420883179, + "num_tokens": 570224009.0, + "step": 14950 + }, + { + "epoch": 1.9019208752067167, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.690006256103516, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8755463361740112, + "num_tokens": 570266703.0, + "step": 14951 + }, + { + "epoch": 1.9020480854853072, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6680965423584, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8685648441314697, + "num_tokens": 570308241.0, + "step": 14952 + }, + { + "epoch": 1.9021752957638978, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.474727630615234, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8692858219146729, + "num_tokens": 570346931.0, + "step": 14953 + }, + { + "epoch": 1.902302506042488, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63729476928711, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8821506500244141, + "num_tokens": 570380373.0, + "step": 14954 + }, + { + "epoch": 1.9024297163210786, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.591474533081055, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8804598450660706, + "num_tokens": 570420106.0, + "step": 14955 + }, + { + "epoch": 1.9025569265996691, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.588672637939453, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8901427984237671, + "num_tokens": 570464534.0, + "step": 14956 + }, + { + "epoch": 1.9026841368782597, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.769548416137695, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8583693504333496, + "num_tokens": 570505734.0, + "step": 14957 + }, + { + "epoch": 1.9028113471568502, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7786922454834, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8785911798477173, + "num_tokens": 570545913.0, + "step": 14958 + }, + { + "epoch": 1.9029385574354407, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.611417770385742, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8754771947860718, + "num_tokens": 570584397.0, + "step": 14959 + }, + { + "epoch": 1.9030657677140312, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.819000244140625, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8746206760406494, + "num_tokens": 570622285.0, + "step": 14960 + }, + { + "epoch": 1.9031929779926218, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.748666763305664, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8543163537979126, + "num_tokens": 570656331.0, + "step": 14961 + }, + { + "epoch": 1.9033201882712123, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.800186157226562, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8732184171676636, + "num_tokens": 570699944.0, + "step": 14962 + }, + { + "epoch": 1.9034473985498028, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.714628219604492, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8636608123779297, + "num_tokens": 570737649.0, + "step": 14963 + }, + { + "epoch": 1.9035746088283934, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73092269897461, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.866573691368103, + "num_tokens": 570773193.0, + "step": 14964 + }, + { + "epoch": 1.9037018191069839, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.515636444091797, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8624430298805237, + "num_tokens": 570809117.0, + "step": 14965 + }, + { + "epoch": 1.9038290293855744, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.902942657470703, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8809972405433655, + "num_tokens": 570844268.0, + "step": 14966 + }, + { + "epoch": 1.903956239664165, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.56062889099121, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8606165051460266, + "num_tokens": 570890166.0, + "step": 14967 + }, + { + "epoch": 1.9040834499427555, + "ewc_loss": 0.03515625, + "ewc_loss_parallel": 3.504753112792969e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.582773208618164, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8640145659446716, + "num_tokens": 570921419.0, + "step": 14968 + }, + { + "epoch": 1.904210660221346, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5454158782959, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8585960865020752, + "num_tokens": 570966444.0, + "step": 14969 + }, + { + "epoch": 1.9043378704999365, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64191436767578, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8791661262512207, + "num_tokens": 571011656.0, + "step": 14970 + }, + { + "epoch": 1.904465080778527, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90082359313965, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8729861974716187, + "num_tokens": 571047828.0, + "step": 14971 + }, + { + "epoch": 1.9045922910571176, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.756521224975586, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8667522072792053, + "num_tokens": 571084541.0, + "step": 14972 + }, + { + "epoch": 1.9047195013357079, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.470718383789062, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8627341985702515, + "num_tokens": 571120978.0, + "step": 14973 + }, + { + "epoch": 1.9048467116142984, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.683334350585938, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8832340240478516, + "num_tokens": 571160181.0, + "step": 14974 + }, + { + "epoch": 1.904973921892889, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58338737487793, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8740679621696472, + "num_tokens": 571194648.0, + "step": 14975 + }, + { + "epoch": 1.9051011321714795, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74843978881836, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8641966581344604, + "num_tokens": 571238449.0, + "step": 14976 + }, + { + "epoch": 1.90522834245007, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.67081069946289, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8692496418952942, + "num_tokens": 571277266.0, + "step": 14977 + }, + { + "epoch": 1.9053555527286605, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.652902603149414, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.880748987197876, + "num_tokens": 571316891.0, + "step": 14978 + }, + { + "epoch": 1.9054827630072508, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.50510597229004, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8710574507713318, + "num_tokens": 571353097.0, + "step": 14979 + }, + { + "epoch": 1.9056099732858414, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.70810890197754, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8833346962928772, + "num_tokens": 571392217.0, + "step": 14980 + }, + { + "epoch": 1.905737183564432, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73297882080078, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8771346807479858, + "num_tokens": 571430440.0, + "step": 14981 + }, + { + "epoch": 1.9058643938430224, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79793357849121, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8647023439407349, + "num_tokens": 571472002.0, + "step": 14982 + }, + { + "epoch": 1.905991604121613, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77164649963379, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8502694964408875, + "num_tokens": 571509368.0, + "step": 14983 + }, + { + "epoch": 1.9061188144002035, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.519901275634766, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8543928861618042, + "num_tokens": 571543898.0, + "step": 14984 + }, + { + "epoch": 1.906246024678794, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77816390991211, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.863486647605896, + "num_tokens": 571584395.0, + "step": 14985 + }, + { + "epoch": 1.9063732349573845, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.678279876708984, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8684952855110168, + "num_tokens": 571624792.0, + "step": 14986 + }, + { + "epoch": 1.906500445235975, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.752586364746094, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8711798787117004, + "num_tokens": 571667197.0, + "step": 14987 + }, + { + "epoch": 1.9066276555145656, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60360336303711, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8626339435577393, + "num_tokens": 571703648.0, + "step": 14988 + }, + { + "epoch": 1.9067548657931561, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.72783660888672, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.861263632774353, + "num_tokens": 571738980.0, + "step": 14989 + }, + { + "epoch": 1.9068820760717466, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.592206954956055, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8717275857925415, + "num_tokens": 571774540.0, + "step": 14990 + }, + { + "epoch": 1.9070092863503372, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75901222229004, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8747005462646484, + "num_tokens": 571805286.0, + "step": 14991 + }, + { + "epoch": 1.9071364966289277, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.739151000976562, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.86261385679245, + "num_tokens": 571847936.0, + "step": 14992 + }, + { + "epoch": 1.9072637069075182, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75275230407715, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8657692670822144, + "num_tokens": 571885454.0, + "step": 14993 + }, + { + "epoch": 1.9073909171861088, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62194061279297, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8672516942024231, + "num_tokens": 571924870.0, + "step": 14994 + }, + { + "epoch": 1.9075181274646993, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.631404876708984, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8480288982391357, + "num_tokens": 571957870.0, + "step": 14995 + }, + { + "epoch": 1.9076453377432898, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.761886596679688, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8624377250671387, + "num_tokens": 571993645.0, + "step": 14996 + }, + { + "epoch": 1.9077725480218801, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.563583374023438, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8778300285339355, + "num_tokens": 572028594.0, + "step": 14997 + }, + { + "epoch": 1.9078997583004706, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73602294921875, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8421192169189453, + "num_tokens": 572066971.0, + "step": 14998 + }, + { + "epoch": 1.9080269685790612, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.684101104736328, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8727809190750122, + "num_tokens": 572107609.0, + "step": 14999 + }, + { + "epoch": 1.9081541788576517, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58905601501465, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8766694068908691, + "num_tokens": 572146940.0, + "step": 15000 + }, + { + "epoch": 1.9082813891362422, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.684864044189453, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8941206336021423, + "num_tokens": 572184010.0, + "step": 15001 + }, + { + "epoch": 1.9084085994148328, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69324493408203, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8520799875259399, + "num_tokens": 572223184.0, + "step": 15002 + }, + { + "epoch": 1.908535809693423, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.774568557739258, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8694854378700256, + "num_tokens": 572261770.0, + "step": 15003 + }, + { + "epoch": 1.9086630199720136, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.66344451904297, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8710324764251709, + "num_tokens": 572303229.0, + "step": 15004 + }, + { + "epoch": 1.9087902302506041, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.786094665527344, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8571284413337708, + "num_tokens": 572337839.0, + "step": 15005 + }, + { + "epoch": 1.9089174405291947, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.584739685058594, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8690414428710938, + "num_tokens": 572373604.0, + "step": 15006 + }, + { + "epoch": 1.9090446508077852, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.640443801879883, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8763754367828369, + "num_tokens": 572410425.0, + "step": 15007 + }, + { + "epoch": 1.9091718610863757, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61726951599121, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8782140016555786, + "num_tokens": 572443998.0, + "step": 15008 + }, + { + "epoch": 1.9092990713649662, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.486042022705078, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8559766411781311, + "num_tokens": 572485337.0, + "step": 15009 + }, + { + "epoch": 1.9094262816435568, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.57067108154297, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8573301434516907, + "num_tokens": 572528273.0, + "step": 15010 + }, + { + "epoch": 1.9095534919221473, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.647235870361328, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8717741370201111, + "num_tokens": 572569467.0, + "step": 15011 + }, + { + "epoch": 1.9096807022007378, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.682056427001953, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8768842220306396, + "num_tokens": 572609626.0, + "step": 15012 + }, + { + "epoch": 1.9098079124793284, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.722888946533203, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8530418276786804, + "num_tokens": 572652907.0, + "step": 15013 + }, + { + "epoch": 1.9099351227579189, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.773258209228516, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8738141655921936, + "num_tokens": 572687460.0, + "step": 15014 + }, + { + "epoch": 1.9100623330365094, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.501136779785156, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8715898990631104, + "num_tokens": 572723937.0, + "step": 15015 + }, + { + "epoch": 1.9101895433151, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.617155075073242, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8645696043968201, + "num_tokens": 572764929.0, + "step": 15016 + }, + { + "epoch": 1.9103167535936905, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.728286743164062, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8660900592803955, + "num_tokens": 572802545.0, + "step": 15017 + }, + { + "epoch": 1.910443963872281, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68764877319336, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8632792830467224, + "num_tokens": 572850008.0, + "step": 15018 + }, + { + "epoch": 1.9105711741508715, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.46855926513672, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8761941194534302, + "num_tokens": 572888968.0, + "step": 15019 + }, + { + "epoch": 1.910698384429462, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.521913528442383, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8668597936630249, + "num_tokens": 572926314.0, + "step": 15020 + }, + { + "epoch": 1.9108255947080524, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.521770477294922, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.872876763343811, + "num_tokens": 572961789.0, + "step": 15021 + }, + { + "epoch": 1.9109528049866429, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77626609802246, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8697678446769714, + "num_tokens": 572993680.0, + "step": 15022 + }, + { + "epoch": 1.9110800152652334, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.660720825195312, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8707436919212341, + "num_tokens": 573024003.0, + "step": 15023 + }, + { + "epoch": 1.911207225543824, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.642101287841797, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8766554594039917, + "num_tokens": 573055138.0, + "step": 15024 + }, + { + "epoch": 1.9113344358224145, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.460622787475586, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8742879629135132, + "num_tokens": 573092373.0, + "step": 15025 + }, + { + "epoch": 1.911461646101005, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.792137145996094, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8643238544464111, + "num_tokens": 573131486.0, + "step": 15026 + }, + { + "epoch": 1.9115888563795955, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.661144256591797, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.878010094165802, + "num_tokens": 573170114.0, + "step": 15027 + }, + { + "epoch": 1.9117160666581858, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.60232162475586, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8727561235427856, + "num_tokens": 573200906.0, + "step": 15028 + }, + { + "epoch": 1.9118432769367764, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.909866333007812, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8560551404953003, + "num_tokens": 573243731.0, + "step": 15029 + }, + { + "epoch": 1.9119704872153669, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.655670166015625, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8755744695663452, + "num_tokens": 573283335.0, + "step": 15030 + }, + { + "epoch": 1.9120976974939574, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94871711730957, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8559308648109436, + "num_tokens": 573326628.0, + "step": 15031 + }, + { + "epoch": 1.912224907772548, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.689847946166992, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8750221729278564, + "num_tokens": 573369025.0, + "step": 15032 + }, + { + "epoch": 1.9123521180511385, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.637720108032227, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8620638251304626, + "num_tokens": 573411923.0, + "step": 15033 + }, + { + "epoch": 1.912479328329729, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.474903106689453, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8646553754806519, + "num_tokens": 573450455.0, + "step": 15034 + }, + { + "epoch": 1.9126065386083195, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.581220626831055, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8591599464416504, + "num_tokens": 573492886.0, + "step": 15035 + }, + { + "epoch": 1.91273374888691, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.050771713256836, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8733919262886047, + "num_tokens": 573532073.0, + "step": 15036 + }, + { + "epoch": 1.9128609591655006, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.655315399169922, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8844563961029053, + "num_tokens": 573573632.0, + "step": 15037 + }, + { + "epoch": 1.9129881694440911, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.783958435058594, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8790038228034973, + "num_tokens": 573610194.0, + "step": 15038 + }, + { + "epoch": 1.9131153797226816, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.70676612854004, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.860988199710846, + "num_tokens": 573650237.0, + "step": 15039 + }, + { + "epoch": 1.9132425900012722, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83778953552246, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.872291088104248, + "num_tokens": 573686859.0, + "step": 15040 + }, + { + "epoch": 1.9133698002798627, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.764455795288086, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.869239091873169, + "num_tokens": 573726770.0, + "step": 15041 + }, + { + "epoch": 1.9134970105584532, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.796619415283203, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8637937307357788, + "num_tokens": 573768668.0, + "step": 15042 + }, + { + "epoch": 1.9136242208370438, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6133975982666, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.859075665473938, + "num_tokens": 573807880.0, + "step": 15043 + }, + { + "epoch": 1.9137514311156343, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71589469909668, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8662441372871399, + "num_tokens": 573847018.0, + "step": 15044 + }, + { + "epoch": 1.9138786413942248, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54830551147461, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8846272826194763, + "num_tokens": 573881193.0, + "step": 15045 + }, + { + "epoch": 1.9140058516728151, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.680912017822266, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8767728805541992, + "num_tokens": 573918457.0, + "step": 15046 + }, + { + "epoch": 1.9141330619514056, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.829477310180664, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8875300884246826, + "num_tokens": 573958818.0, + "step": 15047 + }, + { + "epoch": 1.9142602722299962, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.641002655029297, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8629485964775085, + "num_tokens": 573996735.0, + "step": 15048 + }, + { + "epoch": 1.9143874825085867, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.722230911254883, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8730511665344238, + "num_tokens": 574035005.0, + "step": 15049 + }, + { + "epoch": 1.9145146927871772, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.828140258789062, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8681723475456238, + "num_tokens": 574077817.0, + "step": 15050 + }, + { + "epoch": 1.9146419030657678, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.775779724121094, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8768274188041687, + "num_tokens": 574113083.0, + "step": 15051 + }, + { + "epoch": 1.914769113344358, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.977685928344727, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8729895353317261, + "num_tokens": 574147324.0, + "step": 15052 + }, + { + "epoch": 1.9148963236229486, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76380157470703, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8834337592124939, + "num_tokens": 574186108.0, + "step": 15053 + }, + { + "epoch": 1.9150235339015391, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.618032455444336, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8822386264801025, + "num_tokens": 574218076.0, + "step": 15054 + }, + { + "epoch": 1.9151507441801296, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.866649627685547, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8796252012252808, + "num_tokens": 574256416.0, + "step": 15055 + }, + { + "epoch": 1.9152779544587202, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.772003173828125, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8824172019958496, + "num_tokens": 574295262.0, + "step": 15056 + }, + { + "epoch": 1.9154051647373107, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.611581802368164, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.859089732170105, + "num_tokens": 574329836.0, + "step": 15057 + }, + { + "epoch": 1.9155323750159012, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.649993896484375, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.872146487236023, + "num_tokens": 574363189.0, + "step": 15058 + }, + { + "epoch": 1.9156595852944918, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.781391143798828, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8777849078178406, + "num_tokens": 574396374.0, + "step": 15059 + }, + { + "epoch": 1.9157867955730823, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69879913330078, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.871976375579834, + "num_tokens": 574435488.0, + "step": 15060 + }, + { + "epoch": 1.9159140058516728, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.724628448486328, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8699802756309509, + "num_tokens": 574475261.0, + "step": 15061 + }, + { + "epoch": 1.9160412161302633, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.512001037597656, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8884602785110474, + "num_tokens": 574513531.0, + "step": 15062 + }, + { + "epoch": 1.9161684264088539, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80568504333496, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8556257486343384, + "num_tokens": 574551736.0, + "step": 15063 + }, + { + "epoch": 1.9162956366874444, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.696929931640625, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8635414838790894, + "num_tokens": 574589978.0, + "step": 15064 + }, + { + "epoch": 1.916422846966035, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.840635299682617, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8479443192481995, + "num_tokens": 574628845.0, + "step": 15065 + }, + { + "epoch": 1.9165500572446255, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68773078918457, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8667483329772949, + "num_tokens": 574660700.0, + "step": 15066 + }, + { + "epoch": 1.916677267523216, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.712379455566406, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.860519289970398, + "num_tokens": 574692195.0, + "step": 15067 + }, + { + "epoch": 1.9168044778018065, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.701614379882812, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8606393337249756, + "num_tokens": 574732462.0, + "step": 15068 + }, + { + "epoch": 1.916931688080397, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.725324630737305, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8758422136306763, + "num_tokens": 574769353.0, + "step": 15069 + }, + { + "epoch": 1.9170588983589874, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.788585662841797, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8686935901641846, + "num_tokens": 574804958.0, + "step": 15070 + }, + { + "epoch": 1.9171861086375779, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.716909408569336, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8693901896476746, + "num_tokens": 574843966.0, + "step": 15071 + }, + { + "epoch": 1.9173133189161684, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.770586013793945, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8740025758743286, + "num_tokens": 574882916.0, + "step": 15072 + }, + { + "epoch": 1.917440529194759, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.648731231689453, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8777122497558594, + "num_tokens": 574922154.0, + "step": 15073 + }, + { + "epoch": 1.9175677394733495, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.04607391357422, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8657425045967102, + "num_tokens": 574956894.0, + "step": 15074 + }, + { + "epoch": 1.91769494975194, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.753768920898438, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8839436769485474, + "num_tokens": 574998343.0, + "step": 15075 + }, + { + "epoch": 1.9178221600305305, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87691879272461, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8791837692260742, + "num_tokens": 575036689.0, + "step": 15076 + }, + { + "epoch": 1.9179493703091208, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.686559677124023, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8640663623809814, + "num_tokens": 575075510.0, + "step": 15077 + }, + { + "epoch": 1.9180765805877114, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83344841003418, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8674882650375366, + "num_tokens": 575115797.0, + "step": 15078 + }, + { + "epoch": 1.9182037908663019, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.737699508666992, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8559143543243408, + "num_tokens": 575155240.0, + "step": 15079 + }, + { + "epoch": 1.9183310011448924, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.916065216064453, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.856521487236023, + "num_tokens": 575191048.0, + "step": 15080 + }, + { + "epoch": 1.918458211423483, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87318992614746, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8591798543930054, + "num_tokens": 575227778.0, + "step": 15081 + }, + { + "epoch": 1.9185854217020735, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.709850311279297, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8703025579452515, + "num_tokens": 575269774.0, + "step": 15082 + }, + { + "epoch": 1.918712631980664, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.581615447998047, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8609277606010437, + "num_tokens": 575310710.0, + "step": 15083 + }, + { + "epoch": 1.9188398422592545, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.693727493286133, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8767218589782715, + "num_tokens": 575344073.0, + "step": 15084 + }, + { + "epoch": 1.918967052537845, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.703781127929688, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8601354360580444, + "num_tokens": 575386737.0, + "step": 15085 + }, + { + "epoch": 1.9190942628164356, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.503829956054688, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8738117814064026, + "num_tokens": 575424287.0, + "step": 15086 + }, + { + "epoch": 1.919221473095026, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.881641387939453, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8713442087173462, + "num_tokens": 575463859.0, + "step": 15087 + }, + { + "epoch": 1.9193486833736166, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.584367752075195, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8647007346153259, + "num_tokens": 575500838.0, + "step": 15088 + }, + { + "epoch": 1.9194758936522072, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.950586318969727, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8615717887878418, + "num_tokens": 575535164.0, + "step": 15089 + }, + { + "epoch": 1.9196031039307977, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.847042083740234, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.875352680683136, + "num_tokens": 575569493.0, + "step": 15090 + }, + { + "epoch": 1.9197303142093882, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.772241592407227, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8546919822692871, + "num_tokens": 575603396.0, + "step": 15091 + }, + { + "epoch": 1.9198575244879788, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.908205032348633, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8762706518173218, + "num_tokens": 575638255.0, + "step": 15092 + }, + { + "epoch": 1.9199847347665693, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.715452194213867, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8591424226760864, + "num_tokens": 575685281.0, + "step": 15093 + }, + { + "epoch": 1.9201119450451598, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.651958465576172, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8759085536003113, + "num_tokens": 575725030.0, + "step": 15094 + }, + { + "epoch": 1.9202391553237501, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.836389541625977, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8637009859085083, + "num_tokens": 575764237.0, + "step": 15095 + }, + { + "epoch": 1.9203663656023406, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.727697372436523, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8674911260604858, + "num_tokens": 575796732.0, + "step": 15096 + }, + { + "epoch": 1.9204935758809312, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.804643630981445, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8734784126281738, + "num_tokens": 575833133.0, + "step": 15097 + }, + { + "epoch": 1.9206207861595217, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.732105255126953, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.864326000213623, + "num_tokens": 575870920.0, + "step": 15098 + }, + { + "epoch": 1.9207479964381122, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75892448425293, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8715072870254517, + "num_tokens": 575909132.0, + "step": 15099 + }, + { + "epoch": 1.9208752067167028, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.004026412963867, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8693713545799255, + "num_tokens": 575947979.0, + "step": 15100 + }, + { + "epoch": 1.921002416995293, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.754674911499023, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8585913181304932, + "num_tokens": 575988888.0, + "step": 15101 + }, + { + "epoch": 1.9211296272738836, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8291072845459, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8712759017944336, + "num_tokens": 576028614.0, + "step": 15102 + }, + { + "epoch": 1.9212568375524741, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848499298095703, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8966150283813477, + "num_tokens": 576066648.0, + "step": 15103 + }, + { + "epoch": 1.9213840478310646, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.72234535217285, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.867348849773407, + "num_tokens": 576104045.0, + "step": 15104 + }, + { + "epoch": 1.9215112581096552, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.918006896972656, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8619897365570068, + "num_tokens": 576145419.0, + "step": 15105 + }, + { + "epoch": 1.9216384683882457, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.772737503051758, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8657630681991577, + "num_tokens": 576191410.0, + "step": 15106 + }, + { + "epoch": 1.9217656786668362, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.648527145385742, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.863010048866272, + "num_tokens": 576226383.0, + "step": 15107 + }, + { + "epoch": 1.9218928889454268, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69831657409668, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8620094060897827, + "num_tokens": 576259958.0, + "step": 15108 + }, + { + "epoch": 1.9220200992240173, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.717599868774414, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8721513748168945, + "num_tokens": 576299467.0, + "step": 15109 + }, + { + "epoch": 1.9221473095026078, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.765453338623047, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8628986477851868, + "num_tokens": 576334137.0, + "step": 15110 + }, + { + "epoch": 1.9222745197811983, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.799877166748047, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8656007051467896, + "num_tokens": 576379582.0, + "step": 15111 + }, + { + "epoch": 1.9224017300597889, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.817047119140625, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8836609125137329, + "num_tokens": 576414590.0, + "step": 15112 + }, + { + "epoch": 1.9225289403383794, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.684263229370117, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8612369298934937, + "num_tokens": 576445023.0, + "step": 15113 + }, + { + "epoch": 1.92265615061697, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.820560455322266, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8658931255340576, + "num_tokens": 576477861.0, + "step": 15114 + }, + { + "epoch": 1.9227833608955605, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.727895736694336, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.865761399269104, + "num_tokens": 576513121.0, + "step": 15115 + }, + { + "epoch": 1.922910571174151, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.763290405273438, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8747823238372803, + "num_tokens": 576551039.0, + "step": 15116 + }, + { + "epoch": 1.9230377814527415, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.936485290527344, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8617748022079468, + "num_tokens": 576590651.0, + "step": 15117 + }, + { + "epoch": 1.923164991731332, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.66876220703125, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8602024912834167, + "num_tokens": 576629119.0, + "step": 15118 + }, + { + "epoch": 1.9232922020099223, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.901262283325195, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8783480525016785, + "num_tokens": 576664240.0, + "step": 15119 + }, + { + "epoch": 1.9234194122885129, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93524169921875, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8627462387084961, + "num_tokens": 576701946.0, + "step": 15120 + }, + { + "epoch": 1.9235466225671034, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.803787231445312, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8799822330474854, + "num_tokens": 576733541.0, + "step": 15121 + }, + { + "epoch": 1.923673832845694, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75572395324707, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8549472093582153, + "num_tokens": 576770930.0, + "step": 15122 + }, + { + "epoch": 1.9238010431242845, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.720504760742188, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8693495988845825, + "num_tokens": 576807455.0, + "step": 15123 + }, + { + "epoch": 1.923928253402875, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.697736740112305, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8753383755683899, + "num_tokens": 576849577.0, + "step": 15124 + }, + { + "epoch": 1.9240554636814655, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8751220703125, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8629192113876343, + "num_tokens": 576883506.0, + "step": 15125 + }, + { + "epoch": 1.9241826739600558, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83894920349121, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.852851152420044, + "num_tokens": 576923892.0, + "step": 15126 + }, + { + "epoch": 1.9243098842386464, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.65435791015625, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8850883841514587, + "num_tokens": 576970832.0, + "step": 15127 + }, + { + "epoch": 1.9244370945172369, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.035810470581055, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8506029844284058, + "num_tokens": 577006990.0, + "step": 15128 + }, + { + "epoch": 1.9245643047958274, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.584983825683594, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8736864924430847, + "num_tokens": 577044932.0, + "step": 15129 + }, + { + "epoch": 1.924691515074418, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.947208404541016, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8520259857177734, + "num_tokens": 577076217.0, + "step": 15130 + }, + { + "epoch": 1.9248187253530085, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.749326705932617, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8847159743309021, + "num_tokens": 577112696.0, + "step": 15131 + }, + { + "epoch": 1.924945935631599, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.825422286987305, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8684244155883789, + "num_tokens": 577156397.0, + "step": 15132 + }, + { + "epoch": 1.9250731459101895, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.773839950561523, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8729753494262695, + "num_tokens": 577194192.0, + "step": 15133 + }, + { + "epoch": 1.92520035618878, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6704158782959, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8638900518417358, + "num_tokens": 577232639.0, + "step": 15134 + }, + { + "epoch": 1.9253275664673706, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.712759017944336, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8775457739830017, + "num_tokens": 577267349.0, + "step": 15135 + }, + { + "epoch": 1.925454776745961, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03958511352539, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8644938468933105, + "num_tokens": 577308329.0, + "step": 15136 + }, + { + "epoch": 1.9255819870245516, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.59513282775879, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8695139288902283, + "num_tokens": 577347825.0, + "step": 15137 + }, + { + "epoch": 1.9257091973031422, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.730703353881836, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8697150945663452, + "num_tokens": 577379981.0, + "step": 15138 + }, + { + "epoch": 1.9258364075817327, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.630084991455078, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8720753192901611, + "num_tokens": 577414134.0, + "step": 15139 + }, + { + "epoch": 1.9259636178603232, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.727214813232422, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8669339418411255, + "num_tokens": 577451170.0, + "step": 15140 + }, + { + "epoch": 1.9260908281389137, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.869935989379883, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8551031351089478, + "num_tokens": 577487447.0, + "step": 15141 + }, + { + "epoch": 1.9262180384175043, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.654003143310547, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8734946846961975, + "num_tokens": 577525732.0, + "step": 15142 + }, + { + "epoch": 1.9263452486960948, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.645336151123047, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8673029541969299, + "num_tokens": 577563303.0, + "step": 15143 + }, + { + "epoch": 1.926472458974685, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.679677963256836, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8823700547218323, + "num_tokens": 577607592.0, + "step": 15144 + }, + { + "epoch": 1.9265996692532756, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78329086303711, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8603441715240479, + "num_tokens": 577651646.0, + "step": 15145 + }, + { + "epoch": 1.9267268795318662, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88005256652832, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8769217729568481, + "num_tokens": 577689530.0, + "step": 15146 + }, + { + "epoch": 1.9268540898104567, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.769638061523438, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.863619327545166, + "num_tokens": 577732524.0, + "step": 15147 + }, + { + "epoch": 1.9269813000890472, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.823320388793945, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8732625246047974, + "num_tokens": 577767164.0, + "step": 15148 + }, + { + "epoch": 1.9271085103676378, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.58831024169922, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8767000436782837, + "num_tokens": 577802347.0, + "step": 15149 + }, + { + "epoch": 1.927235720646228, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88056755065918, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8612920045852661, + "num_tokens": 577842899.0, + "step": 15150 + }, + { + "epoch": 1.9273629309248186, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.596656799316406, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8681343793869019, + "num_tokens": 577885137.0, + "step": 15151 + }, + { + "epoch": 1.9274901412034091, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.063722610473633, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8823373317718506, + "num_tokens": 577925727.0, + "step": 15152 + }, + { + "epoch": 1.9276173514819996, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82785987854004, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8721779584884644, + "num_tokens": 577965399.0, + "step": 15153 + }, + { + "epoch": 1.9277445617605902, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62249755859375, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8691556453704834, + "num_tokens": 578007608.0, + "step": 15154 + }, + { + "epoch": 1.9278717720391807, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.880901336669922, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8805021047592163, + "num_tokens": 578043425.0, + "step": 15155 + }, + { + "epoch": 1.9279989823177712, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.668975830078125, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8662317991256714, + "num_tokens": 578075838.0, + "step": 15156 + }, + { + "epoch": 1.9281261925963618, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.964326858520508, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8642431497573853, + "num_tokens": 578110113.0, + "step": 15157 + }, + { + "epoch": 1.9282534028749523, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.70060157775879, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8692047595977783, + "num_tokens": 578145743.0, + "step": 15158 + }, + { + "epoch": 1.9283806131535428, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63081169128418, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8773354887962341, + "num_tokens": 578181588.0, + "step": 15159 + }, + { + "epoch": 1.9285078234321333, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.739107131958008, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8676426410675049, + "num_tokens": 578222048.0, + "step": 15160 + }, + { + "epoch": 1.9286350337107239, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.736543655395508, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8710262775421143, + "num_tokens": 578262664.0, + "step": 15161 + }, + { + "epoch": 1.9287622439893144, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89980125427246, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8573237061500549, + "num_tokens": 578303431.0, + "step": 15162 + }, + { + "epoch": 1.928889454267905, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.758527755737305, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8591040968894958, + "num_tokens": 578338978.0, + "step": 15163 + }, + { + "epoch": 1.9290166645464955, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.709552764892578, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8653719425201416, + "num_tokens": 578376340.0, + "step": 15164 + }, + { + "epoch": 1.929143874825086, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84635353088379, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8708162307739258, + "num_tokens": 578414714.0, + "step": 15165 + }, + { + "epoch": 1.9292710851036765, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.711872100830078, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8547566533088684, + "num_tokens": 578453668.0, + "step": 15166 + }, + { + "epoch": 1.929398295382267, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.704347610473633, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8610067367553711, + "num_tokens": 578494148.0, + "step": 15167 + }, + { + "epoch": 1.9295255056608573, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848798751831055, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8488032817840576, + "num_tokens": 578530743.0, + "step": 15168 + }, + { + "epoch": 1.9296527159394479, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.865148544311523, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8904122114181519, + "num_tokens": 578567613.0, + "step": 15169 + }, + { + "epoch": 1.9297799262180384, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64211082458496, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8675148487091064, + "num_tokens": 578602981.0, + "step": 15170 + }, + { + "epoch": 1.929907136496629, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.592863082885742, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8628121614456177, + "num_tokens": 578641027.0, + "step": 15171 + }, + { + "epoch": 1.9300343467752195, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.910274505615234, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8814042806625366, + "num_tokens": 578683006.0, + "step": 15172 + }, + { + "epoch": 1.93016155705381, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.609373092651367, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8731917142868042, + "num_tokens": 578716722.0, + "step": 15173 + }, + { + "epoch": 1.9302887673324005, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.842012405395508, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8500065803527832, + "num_tokens": 578750740.0, + "step": 15174 + }, + { + "epoch": 1.9304159776109908, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77963638305664, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8715251684188843, + "num_tokens": 578791358.0, + "step": 15175 + }, + { + "epoch": 1.9305431878895813, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.795347213745117, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8532071113586426, + "num_tokens": 578834514.0, + "step": 15176 + }, + { + "epoch": 1.9306703981681719, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.714962005615234, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8606879711151123, + "num_tokens": 578875869.0, + "step": 15177 + }, + { + "epoch": 1.9307976084467624, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74496078491211, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8721036314964294, + "num_tokens": 578921079.0, + "step": 15178 + }, + { + "epoch": 1.930924818725353, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.973859786987305, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8675724864006042, + "num_tokens": 578959837.0, + "step": 15179 + }, + { + "epoch": 1.9310520290039435, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77437973022461, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8406472206115723, + "num_tokens": 579001629.0, + "step": 15180 + }, + { + "epoch": 1.931179239282534, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79542350769043, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8908966183662415, + "num_tokens": 579036762.0, + "step": 15181 + }, + { + "epoch": 1.9313064495611245, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80754280090332, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8468675017356873, + "num_tokens": 579078455.0, + "step": 15182 + }, + { + "epoch": 1.931433659839715, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.908464431762695, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.871829628944397, + "num_tokens": 579116942.0, + "step": 15183 + }, + { + "epoch": 1.9315608701183056, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91486930847168, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8804998397827148, + "num_tokens": 579155180.0, + "step": 15184 + }, + { + "epoch": 1.931688080396896, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.785520553588867, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8612141013145447, + "num_tokens": 579194336.0, + "step": 15185 + }, + { + "epoch": 1.9318152906754866, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.845178604125977, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8649581670761108, + "num_tokens": 579226609.0, + "step": 15186 + }, + { + "epoch": 1.9319425009540772, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.656494140625, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8730248212814331, + "num_tokens": 579259288.0, + "step": 15187 + }, + { + "epoch": 1.9320697112326677, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.854148864746094, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8767770528793335, + "num_tokens": 579294797.0, + "step": 15188 + }, + { + "epoch": 1.9321969215112582, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.628746032714844, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8789936900138855, + "num_tokens": 579334352.0, + "step": 15189 + }, + { + "epoch": 1.9323241317898487, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.854040145874023, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8574579358100891, + "num_tokens": 579379618.0, + "step": 15190 + }, + { + "epoch": 1.9324513420684393, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.937898635864258, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.86844801902771, + "num_tokens": 579420230.0, + "step": 15191 + }, + { + "epoch": 1.9325785523470298, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.734718322753906, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.869078516960144, + "num_tokens": 579458236.0, + "step": 15192 + }, + { + "epoch": 1.93270576262562, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.858505249023438, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8678290247917175, + "num_tokens": 579495561.0, + "step": 15193 + }, + { + "epoch": 1.9328329729042106, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.816194534301758, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8674317598342896, + "num_tokens": 579530225.0, + "step": 15194 + }, + { + "epoch": 1.9329601831828012, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.755380630493164, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8602133393287659, + "num_tokens": 579570143.0, + "step": 15195 + }, + { + "epoch": 1.9330873934613917, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7695369720459, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.881558895111084, + "num_tokens": 579607105.0, + "step": 15196 + }, + { + "epoch": 1.9332146037399822, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.981977462768555, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8832799196243286, + "num_tokens": 579640018.0, + "step": 15197 + }, + { + "epoch": 1.9333418140185727, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.886316299438477, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8636776208877563, + "num_tokens": 579681250.0, + "step": 15198 + }, + { + "epoch": 1.933469024297163, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64897918701172, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8578954935073853, + "num_tokens": 579718535.0, + "step": 15199 + }, + { + "epoch": 1.9335962345757536, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.845197677612305, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8563541769981384, + "num_tokens": 579756959.0, + "step": 15200 + }, + { + "epoch": 1.933723444854344, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.973966598510742, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8586529493331909, + "num_tokens": 579798266.0, + "step": 15201 + }, + { + "epoch": 1.9338506551329346, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.97275161743164, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8870101571083069, + "num_tokens": 579831869.0, + "step": 15202 + }, + { + "epoch": 1.9339778654115252, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.877830505371094, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8708490133285522, + "num_tokens": 579874168.0, + "step": 15203 + }, + { + "epoch": 1.9341050756901157, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.817466735839844, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8754400610923767, + "num_tokens": 579916313.0, + "step": 15204 + }, + { + "epoch": 1.9342322859687062, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.832426071166992, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8797285556793213, + "num_tokens": 579959022.0, + "step": 15205 + }, + { + "epoch": 1.9343594962472968, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87077522277832, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8540708422660828, + "num_tokens": 580003675.0, + "step": 15206 + }, + { + "epoch": 1.9344867065258873, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.814861297607422, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8780686855316162, + "num_tokens": 580044384.0, + "step": 15207 + }, + { + "epoch": 1.9346139168044778, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.822031021118164, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8582942485809326, + "num_tokens": 580078955.0, + "step": 15208 + }, + { + "epoch": 1.9347411270830683, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.885221481323242, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8656942844390869, + "num_tokens": 580116793.0, + "step": 15209 + }, + { + "epoch": 1.9348683373616589, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77861213684082, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8676760792732239, + "num_tokens": 580153622.0, + "step": 15210 + }, + { + "epoch": 1.9349955476402494, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.823200225830078, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8666486740112305, + "num_tokens": 580195287.0, + "step": 15211 + }, + { + "epoch": 1.93512275791884, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88868522644043, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8565551042556763, + "num_tokens": 580229601.0, + "step": 15212 + }, + { + "epoch": 1.9352499681974304, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.904861450195312, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8703739643096924, + "num_tokens": 580268079.0, + "step": 15213 + }, + { + "epoch": 1.935377178476021, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.843551635742188, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8507625460624695, + "num_tokens": 580305379.0, + "step": 15214 + }, + { + "epoch": 1.9355043887546115, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80091667175293, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.868328332901001, + "num_tokens": 580338438.0, + "step": 15215 + }, + { + "epoch": 1.935631599033202, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.843027114868164, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8690904378890991, + "num_tokens": 580370017.0, + "step": 15216 + }, + { + "epoch": 1.9357588093117923, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.819355010986328, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8786080479621887, + "num_tokens": 580409572.0, + "step": 15217 + }, + { + "epoch": 1.9358860195903829, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.824459075927734, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8738396167755127, + "num_tokens": 580448481.0, + "step": 15218 + }, + { + "epoch": 1.9360132298689734, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.849092483520508, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8805596828460693, + "num_tokens": 580481919.0, + "step": 15219 + }, + { + "epoch": 1.936140440147564, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.153362274169922, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8626861572265625, + "num_tokens": 580520758.0, + "step": 15220 + }, + { + "epoch": 1.9362676504261545, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73995590209961, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8591623306274414, + "num_tokens": 580559839.0, + "step": 15221 + }, + { + "epoch": 1.936394860704745, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.935436248779297, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.894639790058136, + "num_tokens": 580591767.0, + "step": 15222 + }, + { + "epoch": 1.9365220709833355, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63620376586914, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8767659068107605, + "num_tokens": 580633180.0, + "step": 15223 + }, + { + "epoch": 1.9366492812619258, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.824831008911133, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8703136444091797, + "num_tokens": 580677045.0, + "step": 15224 + }, + { + "epoch": 1.9367764915405163, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.85849952697754, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8816118240356445, + "num_tokens": 580717423.0, + "step": 15225 + }, + { + "epoch": 1.9369037018191069, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.718101501464844, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8732197284698486, + "num_tokens": 580756677.0, + "step": 15226 + }, + { + "epoch": 1.9370309120976974, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83942985534668, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.873356819152832, + "num_tokens": 580796993.0, + "step": 15227 + }, + { + "epoch": 1.937158122376288, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07230567932129, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8720705509185791, + "num_tokens": 580834616.0, + "step": 15228 + }, + { + "epoch": 1.9372853326548785, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.585098266601562, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8719602823257446, + "num_tokens": 580873718.0, + "step": 15229 + }, + { + "epoch": 1.937412542933469, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68339729309082, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8641589879989624, + "num_tokens": 580911678.0, + "step": 15230 + }, + { + "epoch": 1.9375397532120595, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.785675048828125, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8756102919578552, + "num_tokens": 580953282.0, + "step": 15231 + }, + { + "epoch": 1.93766696349065, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.645971298217773, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8754634857177734, + "num_tokens": 580997954.0, + "step": 15232 + }, + { + "epoch": 1.9377941737692406, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90057945251465, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8704767227172852, + "num_tokens": 581035623.0, + "step": 15233 + }, + { + "epoch": 1.937921384047831, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78261375427246, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8786092400550842, + "num_tokens": 581068727.0, + "step": 15234 + }, + { + "epoch": 1.9380485943264216, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.987802505493164, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8671295642852783, + "num_tokens": 581111180.0, + "step": 15235 + }, + { + "epoch": 1.9381758046050122, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.804332733154297, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8488328456878662, + "num_tokens": 581152373.0, + "step": 15236 + }, + { + "epoch": 1.9383030148836027, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.81751823425293, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8662145137786865, + "num_tokens": 581195854.0, + "step": 15237 + }, + { + "epoch": 1.9384302251621932, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.910259246826172, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.842121422290802, + "num_tokens": 581236414.0, + "step": 15238 + }, + { + "epoch": 1.9385574354407837, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.674238204956055, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8520833849906921, + "num_tokens": 581275349.0, + "step": 15239 + }, + { + "epoch": 1.9386846457193743, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91895294189453, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8760698437690735, + "num_tokens": 581313231.0, + "step": 15240 + }, + { + "epoch": 1.9388118559979648, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.604944229125977, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8695300221443176, + "num_tokens": 581355295.0, + "step": 15241 + }, + { + "epoch": 1.938939066276555, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78093147277832, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8718539476394653, + "num_tokens": 581392633.0, + "step": 15242 + }, + { + "epoch": 1.9390662765551456, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.825952529907227, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8822489976882935, + "num_tokens": 581425552.0, + "step": 15243 + }, + { + "epoch": 1.9391934868337362, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.763935089111328, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8573623299598694, + "num_tokens": 581467707.0, + "step": 15244 + }, + { + "epoch": 1.9393206971123267, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6690616607666, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8750810623168945, + "num_tokens": 581504888.0, + "step": 15245 + }, + { + "epoch": 1.9394479073909172, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.763517379760742, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8680827617645264, + "num_tokens": 581537800.0, + "step": 15246 + }, + { + "epoch": 1.9395751176695077, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.933727264404297, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8902190923690796, + "num_tokens": 581570594.0, + "step": 15247 + }, + { + "epoch": 1.939702327948098, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.749807357788086, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8626096248626709, + "num_tokens": 581610764.0, + "step": 15248 + }, + { + "epoch": 1.9398295382266886, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.727149963378906, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.868816077709198, + "num_tokens": 581652341.0, + "step": 15249 + }, + { + "epoch": 1.939956748505279, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.938194274902344, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8608474731445312, + "num_tokens": 581689978.0, + "step": 15250 + }, + { + "epoch": 1.9400839587838696, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.636646270751953, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8772146105766296, + "num_tokens": 581728811.0, + "step": 15251 + }, + { + "epoch": 1.9402111690624602, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.055946350097656, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8770841956138611, + "num_tokens": 581768016.0, + "step": 15252 + }, + { + "epoch": 1.9403383793410507, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71420669555664, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8712780475616455, + "num_tokens": 581805405.0, + "step": 15253 + }, + { + "epoch": 1.9404655896196412, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75580596923828, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8492348194122314, + "num_tokens": 581845302.0, + "step": 15254 + }, + { + "epoch": 1.9405927998982317, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.845407485961914, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8782923817634583, + "num_tokens": 581875391.0, + "step": 15255 + }, + { + "epoch": 1.9407200101768223, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.834836959838867, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8684694766998291, + "num_tokens": 581916580.0, + "step": 15256 + }, + { + "epoch": 1.9408472204554128, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.781070709228516, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8695124387741089, + "num_tokens": 581958539.0, + "step": 15257 + }, + { + "epoch": 1.9409744307340033, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.120725631713867, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8676154017448425, + "num_tokens": 581996981.0, + "step": 15258 + }, + { + "epoch": 1.9411016410125939, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.767282485961914, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8792130947113037, + "num_tokens": 582036068.0, + "step": 15259 + }, + { + "epoch": 1.9412288512911844, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.804370880126953, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8716838359832764, + "num_tokens": 582073322.0, + "step": 15260 + }, + { + "epoch": 1.941356061569775, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.903430938720703, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8719837665557861, + "num_tokens": 582110736.0, + "step": 15261 + }, + { + "epoch": 1.9414832718483654, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99451446533203, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8530523180961609, + "num_tokens": 582153431.0, + "step": 15262 + }, + { + "epoch": 1.941610482126956, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.579893112182617, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8502094745635986, + "num_tokens": 582191793.0, + "step": 15263 + }, + { + "epoch": 1.9417376924055465, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.084341049194336, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8599074482917786, + "num_tokens": 582227923.0, + "step": 15264 + }, + { + "epoch": 1.941864902684137, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74600601196289, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8542391657829285, + "num_tokens": 582268029.0, + "step": 15265 + }, + { + "epoch": 1.9419921129627273, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.862958908081055, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8628755807876587, + "num_tokens": 582302173.0, + "step": 15266 + }, + { + "epoch": 1.9421193232413179, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.863107681274414, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8837473392486572, + "num_tokens": 582341642.0, + "step": 15267 + }, + { + "epoch": 1.9422465335199084, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.734943389892578, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.871767520904541, + "num_tokens": 582378960.0, + "step": 15268 + }, + { + "epoch": 1.942373743798499, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.799549102783203, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8762609958648682, + "num_tokens": 582422149.0, + "step": 15269 + }, + { + "epoch": 1.9425009540770894, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.786653518676758, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8529175519943237, + "num_tokens": 582462657.0, + "step": 15270 + }, + { + "epoch": 1.94262816435568, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.889644622802734, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8813536763191223, + "num_tokens": 582499264.0, + "step": 15271 + }, + { + "epoch": 1.9427553746342705, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.81755256652832, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.872620165348053, + "num_tokens": 582534744.0, + "step": 15272 + }, + { + "epoch": 1.9428825849128608, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.693361282348633, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8658369779586792, + "num_tokens": 582577010.0, + "step": 15273 + }, + { + "epoch": 1.9430097951914513, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87214469909668, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8646443486213684, + "num_tokens": 582617103.0, + "step": 15274 + }, + { + "epoch": 1.9431370054700419, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.760982513427734, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8631565570831299, + "num_tokens": 582661754.0, + "step": 15275 + }, + { + "epoch": 1.9432642157486324, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74349021911621, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8713389039039612, + "num_tokens": 582701701.0, + "step": 15276 + }, + { + "epoch": 1.943391426027223, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.831979751586914, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.860058069229126, + "num_tokens": 582741977.0, + "step": 15277 + }, + { + "epoch": 1.9435186363058135, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.810266494750977, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8777592778205872, + "num_tokens": 582783420.0, + "step": 15278 + }, + { + "epoch": 1.943645846584404, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.750242233276367, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8751986622810364, + "num_tokens": 582819464.0, + "step": 15279 + }, + { + "epoch": 1.9437730568629945, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84096336364746, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8739856481552124, + "num_tokens": 582860757.0, + "step": 15280 + }, + { + "epoch": 1.943900267141585, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.811716079711914, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8641209006309509, + "num_tokens": 582910300.0, + "step": 15281 + }, + { + "epoch": 1.9440274774201756, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8498477935791, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8619568347930908, + "num_tokens": 582956333.0, + "step": 15282 + }, + { + "epoch": 1.944154687698766, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.72173500061035, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8774008750915527, + "num_tokens": 582994068.0, + "step": 15283 + }, + { + "epoch": 1.9442818979773566, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78615379333496, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8672516942024231, + "num_tokens": 583027829.0, + "step": 15284 + }, + { + "epoch": 1.9444091082559471, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.706789016723633, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8662402033805847, + "num_tokens": 583068881.0, + "step": 15285 + }, + { + "epoch": 1.9445363185345377, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.847347259521484, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8723999261856079, + "num_tokens": 583100097.0, + "step": 15286 + }, + { + "epoch": 1.9446635288131282, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.850818634033203, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8682438731193542, + "num_tokens": 583135340.0, + "step": 15287 + }, + { + "epoch": 1.9447907390917187, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.638381958007812, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.868492066860199, + "num_tokens": 583172032.0, + "step": 15288 + }, + { + "epoch": 1.9449179493703093, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.929672241210938, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8728940486907959, + "num_tokens": 583205244.0, + "step": 15289 + }, + { + "epoch": 1.9450451596488998, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.685287475585938, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8653682470321655, + "num_tokens": 583247537.0, + "step": 15290 + }, + { + "epoch": 1.94517236992749, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.588388442993164, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8621255159378052, + "num_tokens": 583285349.0, + "step": 15291 + }, + { + "epoch": 1.9452995802060806, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.756149291992188, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8704710602760315, + "num_tokens": 583323810.0, + "step": 15292 + }, + { + "epoch": 1.9454267904846712, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.576662063598633, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.881665050983429, + "num_tokens": 583354244.0, + "step": 15293 + }, + { + "epoch": 1.9455540007632617, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83085060119629, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8829245567321777, + "num_tokens": 583391764.0, + "step": 15294 + }, + { + "epoch": 1.9456812110418522, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76195526123047, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8564392924308777, + "num_tokens": 583426305.0, + "step": 15295 + }, + { + "epoch": 1.9458084213204427, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.740964889526367, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8652377724647522, + "num_tokens": 583465252.0, + "step": 15296 + }, + { + "epoch": 1.945935631599033, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.576557159423828, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8605155348777771, + "num_tokens": 583510001.0, + "step": 15297 + }, + { + "epoch": 1.9460628418776236, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.851318359375, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8708245754241943, + "num_tokens": 583547380.0, + "step": 15298 + }, + { + "epoch": 1.946190052156214, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.741159439086914, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8708713054656982, + "num_tokens": 583588852.0, + "step": 15299 + }, + { + "epoch": 1.9463172624348046, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6983642578125, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8754779100418091, + "num_tokens": 583634006.0, + "step": 15300 + }, + { + "epoch": 1.9464444727133952, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.628686904907227, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8554194569587708, + "num_tokens": 583679430.0, + "step": 15301 + }, + { + "epoch": 1.9465716829919857, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7526798248291, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8756663799285889, + "num_tokens": 583708856.0, + "step": 15302 + }, + { + "epoch": 1.9466988932705762, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.85634422302246, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8671320676803589, + "num_tokens": 583750737.0, + "step": 15303 + }, + { + "epoch": 1.9468261035491667, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.635395050048828, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8744363784790039, + "num_tokens": 583786342.0, + "step": 15304 + }, + { + "epoch": 1.9469533138277573, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.834880828857422, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8742753267288208, + "num_tokens": 583821681.0, + "step": 15305 + }, + { + "epoch": 1.9470805241063478, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.666902542114258, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8513442277908325, + "num_tokens": 583855640.0, + "step": 15306 + }, + { + "epoch": 1.9472077343849383, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76146697998047, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8783986568450928, + "num_tokens": 583888047.0, + "step": 15307 + }, + { + "epoch": 1.9473349446635289, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.795412063598633, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8682039380073547, + "num_tokens": 583923220.0, + "step": 15308 + }, + { + "epoch": 1.9474621549421194, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.786542892456055, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.864342451095581, + "num_tokens": 583965718.0, + "step": 15309 + }, + { + "epoch": 1.94758936522071, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82010269165039, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8747972846031189, + "num_tokens": 584014027.0, + "step": 15310 + }, + { + "epoch": 1.9477165754993004, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.878559112548828, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8731048107147217, + "num_tokens": 584045019.0, + "step": 15311 + }, + { + "epoch": 1.947843785777891, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.612253189086914, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8536783456802368, + "num_tokens": 584086839.0, + "step": 15312 + }, + { + "epoch": 1.9479709960564815, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.842309951782227, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.883391261100769, + "num_tokens": 584123785.0, + "step": 15313 + }, + { + "epoch": 1.948098206335072, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.669384002685547, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8607683181762695, + "num_tokens": 584163768.0, + "step": 15314 + }, + { + "epoch": 1.9482254166136623, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.860815048217773, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.865071177482605, + "num_tokens": 584201668.0, + "step": 15315 + }, + { + "epoch": 1.9483526268922529, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.573535919189453, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8775714039802551, + "num_tokens": 584236626.0, + "step": 15316 + }, + { + "epoch": 1.9484798371708434, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.788644790649414, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8531749844551086, + "num_tokens": 584279811.0, + "step": 15317 + }, + { + "epoch": 1.948607047449434, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.783100128173828, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8692237734794617, + "num_tokens": 584317314.0, + "step": 15318 + }, + { + "epoch": 1.9487342577280244, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.685182571411133, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8745323419570923, + "num_tokens": 584355768.0, + "step": 15319 + }, + { + "epoch": 1.948861468006615, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.611347198486328, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8658244013786316, + "num_tokens": 584386205.0, + "step": 15320 + }, + { + "epoch": 1.9489886782852053, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.939699172973633, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8814067840576172, + "num_tokens": 584419721.0, + "step": 15321 + }, + { + "epoch": 1.9491158885637958, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.628400802612305, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8671409487724304, + "num_tokens": 584451029.0, + "step": 15322 + }, + { + "epoch": 1.9492430988423863, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.796749114990234, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8833699226379395, + "num_tokens": 584483991.0, + "step": 15323 + }, + { + "epoch": 1.9493703091209769, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.981840133666992, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8672522902488708, + "num_tokens": 584517652.0, + "step": 15324 + }, + { + "epoch": 1.9494975193995674, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.663951873779297, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.877246618270874, + "num_tokens": 584561058.0, + "step": 15325 + }, + { + "epoch": 1.949624729678158, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.750812530517578, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8728479146957397, + "num_tokens": 584595097.0, + "step": 15326 + }, + { + "epoch": 1.9497519399567484, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.862491607666016, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8760810494422913, + "num_tokens": 584639073.0, + "step": 15327 + }, + { + "epoch": 1.949879150235339, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.805198669433594, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8600651621818542, + "num_tokens": 584680517.0, + "step": 15328 + }, + { + "epoch": 1.9500063605139295, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83352279663086, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8694121241569519, + "num_tokens": 584717796.0, + "step": 15329 + }, + { + "epoch": 1.95013357079252, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76205062866211, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8832859396934509, + "num_tokens": 584755595.0, + "step": 15330 + }, + { + "epoch": 1.9502607810711106, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88005256652832, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8589926958084106, + "num_tokens": 584798681.0, + "step": 15331 + }, + { + "epoch": 1.950387991349701, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.830472946166992, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8722782135009766, + "num_tokens": 584835424.0, + "step": 15332 + }, + { + "epoch": 1.9505152016282916, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.790796279907227, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8828350305557251, + "num_tokens": 584872063.0, + "step": 15333 + }, + { + "epoch": 1.9506424119068821, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.750865936279297, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.884232759475708, + "num_tokens": 584905850.0, + "step": 15334 + }, + { + "epoch": 1.9507696221854727, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.722070693969727, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8775874376296997, + "num_tokens": 584946480.0, + "step": 15335 + }, + { + "epoch": 1.9508968324640632, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.712448120117188, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8759080171585083, + "num_tokens": 584977382.0, + "step": 15336 + }, + { + "epoch": 1.9510240427426537, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.95551300048828, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.872468113899231, + "num_tokens": 585010885.0, + "step": 15337 + }, + { + "epoch": 1.9511512530212443, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.651473999023438, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.868958592414856, + "num_tokens": 585048208.0, + "step": 15338 + }, + { + "epoch": 1.9512784632998348, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77743148803711, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8650243878364563, + "num_tokens": 585087477.0, + "step": 15339 + }, + { + "epoch": 1.951405673578425, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.816762924194336, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8693220019340515, + "num_tokens": 585120860.0, + "step": 15340 + }, + { + "epoch": 1.9515328838570156, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.757781982421875, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8704532384872437, + "num_tokens": 585161887.0, + "step": 15341 + }, + { + "epoch": 1.9516600941356061, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.679636001586914, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8780816793441772, + "num_tokens": 585200496.0, + "step": 15342 + }, + { + "epoch": 1.9517873044141967, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.947067260742188, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8698623180389404, + "num_tokens": 585243976.0, + "step": 15343 + }, + { + "epoch": 1.9519145146927872, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77715492248535, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8645544052124023, + "num_tokens": 585282366.0, + "step": 15344 + }, + { + "epoch": 1.9520417249713777, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.807514190673828, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8687795400619507, + "num_tokens": 585325224.0, + "step": 15345 + }, + { + "epoch": 1.952168935249968, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.901023864746094, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8786197304725647, + "num_tokens": 585361954.0, + "step": 15346 + }, + { + "epoch": 1.9522961455285586, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.611431121826172, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8658952713012695, + "num_tokens": 585406033.0, + "step": 15347 + }, + { + "epoch": 1.952423355807149, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.65154457092285, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8682844042778015, + "num_tokens": 585440457.0, + "step": 15348 + }, + { + "epoch": 1.9525505660857396, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.941831588745117, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8909283876419067, + "num_tokens": 585473451.0, + "step": 15349 + }, + { + "epoch": 1.9526777763643302, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.797456741333008, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8735717535018921, + "num_tokens": 585509652.0, + "step": 15350 + }, + { + "epoch": 1.9528049866429207, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.782636642456055, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.874595046043396, + "num_tokens": 585545591.0, + "step": 15351 + }, + { + "epoch": 1.9529321969215112, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.912662506103516, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8579543828964233, + "num_tokens": 585583757.0, + "step": 15352 + }, + { + "epoch": 1.9530594072001017, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8580379486084, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8584729433059692, + "num_tokens": 585623897.0, + "step": 15353 + }, + { + "epoch": 1.9531866174786923, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06732749938965, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8723065853118896, + "num_tokens": 585661043.0, + "step": 15354 + }, + { + "epoch": 1.9533138277572828, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.979036331176758, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8429453372955322, + "num_tokens": 585699201.0, + "step": 15355 + }, + { + "epoch": 1.9534410380358733, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.691715240478516, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8625442981719971, + "num_tokens": 585735325.0, + "step": 15356 + }, + { + "epoch": 1.9535682483144639, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.012205123901367, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8769792914390564, + "num_tokens": 585769909.0, + "step": 15357 + }, + { + "epoch": 1.9536954585930544, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.843185424804688, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8583735227584839, + "num_tokens": 585804590.0, + "step": 15358 + }, + { + "epoch": 1.953822668871645, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.917736053466797, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8651373982429504, + "num_tokens": 585843349.0, + "step": 15359 + }, + { + "epoch": 1.9539498791502354, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.764820098876953, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8814679384231567, + "num_tokens": 585881846.0, + "step": 15360 + }, + { + "epoch": 1.954077089428826, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.645139694213867, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8675401210784912, + "num_tokens": 585917848.0, + "step": 15361 + }, + { + "epoch": 1.9542042997074165, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.978862762451172, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8749814629554749, + "num_tokens": 585961249.0, + "step": 15362 + }, + { + "epoch": 1.954331509986007, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68337631225586, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8778074979782104, + "num_tokens": 586005795.0, + "step": 15363 + }, + { + "epoch": 1.9544587202645973, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.015520095825195, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8786054849624634, + "num_tokens": 586041523.0, + "step": 15364 + }, + { + "epoch": 1.9545859305431879, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.868520736694336, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8604735136032104, + "num_tokens": 586077800.0, + "step": 15365 + }, + { + "epoch": 1.9547131408217784, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.710033416748047, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8646546602249146, + "num_tokens": 586116010.0, + "step": 15366 + }, + { + "epoch": 1.954840351100369, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.866662979125977, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8817018866539001, + "num_tokens": 586155359.0, + "step": 15367 + }, + { + "epoch": 1.9549675613789594, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.681625366210938, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8747130632400513, + "num_tokens": 586188190.0, + "step": 15368 + }, + { + "epoch": 1.95509477165755, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5660343170166, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8690475225448608, + "num_tokens": 586225971.0, + "step": 15369 + }, + { + "epoch": 1.9552219819361403, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.756637573242188, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8661960363388062, + "num_tokens": 586263295.0, + "step": 15370 + }, + { + "epoch": 1.9553491922147308, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.808931350708008, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.892665445804596, + "num_tokens": 586293547.0, + "step": 15371 + }, + { + "epoch": 1.9554764024933213, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.781497955322266, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8651533722877502, + "num_tokens": 586333180.0, + "step": 15372 + }, + { + "epoch": 1.9556036127719119, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.758224487304688, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8709132671356201, + "num_tokens": 586368780.0, + "step": 15373 + }, + { + "epoch": 1.9557308230505024, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79935646057129, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8545522093772888, + "num_tokens": 586404177.0, + "step": 15374 + }, + { + "epoch": 1.955858033329093, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.628934860229492, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.855069637298584, + "num_tokens": 586446917.0, + "step": 15375 + }, + { + "epoch": 1.9559852436076834, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.70376205444336, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8703906536102295, + "num_tokens": 586485832.0, + "step": 15376 + }, + { + "epoch": 1.956112453886274, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83617401123047, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8803470730781555, + "num_tokens": 586524089.0, + "step": 15377 + }, + { + "epoch": 1.9562396641648645, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.48651695251465, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8737435340881348, + "num_tokens": 586564097.0, + "step": 15378 + }, + { + "epoch": 1.956366874443455, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.745651245117188, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8830848336219788, + "num_tokens": 586600653.0, + "step": 15379 + }, + { + "epoch": 1.9564940847220456, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.797996520996094, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.876177966594696, + "num_tokens": 586639527.0, + "step": 15380 + }, + { + "epoch": 1.956621295000636, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61968994140625, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8748946189880371, + "num_tokens": 586670885.0, + "step": 15381 + }, + { + "epoch": 1.9567485052792266, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.770307540893555, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8833762407302856, + "num_tokens": 586709875.0, + "step": 15382 + }, + { + "epoch": 1.9568757155578171, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80327796936035, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8639829158782959, + "num_tokens": 586753164.0, + "step": 15383 + }, + { + "epoch": 1.9570029258364077, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.682254791259766, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.877571702003479, + "num_tokens": 586793737.0, + "step": 15384 + }, + { + "epoch": 1.9571301361149982, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.733619689941406, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8657230138778687, + "num_tokens": 586833428.0, + "step": 15385 + }, + { + "epoch": 1.9572573463935887, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75635528564453, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.846743106842041, + "num_tokens": 586871202.0, + "step": 15386 + }, + { + "epoch": 1.9573845566721793, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.913917541503906, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8806298971176147, + "num_tokens": 586905537.0, + "step": 15387 + }, + { + "epoch": 1.9575117669507698, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.725000381469727, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8600603342056274, + "num_tokens": 586940378.0, + "step": 15388 + }, + { + "epoch": 1.95763897722936, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.769132614135742, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8802809715270996, + "num_tokens": 586978806.0, + "step": 15389 + }, + { + "epoch": 1.9577661875079506, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79669952392578, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8630701303482056, + "num_tokens": 587007367.0, + "step": 15390 + }, + { + "epoch": 1.9578933977865411, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.741737365722656, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8915484547615051, + "num_tokens": 587040422.0, + "step": 15391 + }, + { + "epoch": 1.9580206080651317, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8403263092041, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8865966200828552, + "num_tokens": 587070238.0, + "step": 15392 + }, + { + "epoch": 1.9581478183437222, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.929363250732422, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8762486577033997, + "num_tokens": 587105973.0, + "step": 15393 + }, + { + "epoch": 1.9582750286223127, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.825986862182617, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.883631706237793, + "num_tokens": 587137975.0, + "step": 15394 + }, + { + "epoch": 1.958402238900903, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.759178161621094, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8664915561676025, + "num_tokens": 587169176.0, + "step": 15395 + }, + { + "epoch": 1.9585294491794936, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.908607482910156, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8677540421485901, + "num_tokens": 587205570.0, + "step": 15396 + }, + { + "epoch": 1.958656659458084, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.798686981201172, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8620174527168274, + "num_tokens": 587246729.0, + "step": 15397 + }, + { + "epoch": 1.9587838697366746, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.753711700439453, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.874466061592102, + "num_tokens": 587284537.0, + "step": 15398 + }, + { + "epoch": 1.9589110800152651, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88155746459961, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.874592661857605, + "num_tokens": 587324394.0, + "step": 15399 + }, + { + "epoch": 1.9590382902938557, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.513404846191406, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8772826790809631, + "num_tokens": 587359935.0, + "step": 15400 + }, + { + "epoch": 1.9591655005724462, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.893104553222656, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8817629814147949, + "num_tokens": 587402081.0, + "step": 15401 + }, + { + "epoch": 1.9592927108510367, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.906795501708984, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8677310347557068, + "num_tokens": 587442939.0, + "step": 15402 + }, + { + "epoch": 1.9594199211296273, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.705324172973633, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8621047139167786, + "num_tokens": 587479155.0, + "step": 15403 + }, + { + "epoch": 1.9595471314082178, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.81549644470215, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8595840930938721, + "num_tokens": 587519285.0, + "step": 15404 + }, + { + "epoch": 1.9596743416868083, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.849536895751953, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8520428538322449, + "num_tokens": 587563978.0, + "step": 15405 + }, + { + "epoch": 1.9598015519653988, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.926610946655273, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8643290400505066, + "num_tokens": 587599052.0, + "step": 15406 + }, + { + "epoch": 1.9599287622439894, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.897293090820312, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8740836381912231, + "num_tokens": 587639980.0, + "step": 15407 + }, + { + "epoch": 1.96005597252258, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.942171096801758, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8659630417823792, + "num_tokens": 587673836.0, + "step": 15408 + }, + { + "epoch": 1.9601831828011704, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75000762939453, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8706659078598022, + "num_tokens": 587712896.0, + "step": 15409 + }, + { + "epoch": 1.960310393079761, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91339874267578, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.864233136177063, + "num_tokens": 587747352.0, + "step": 15410 + }, + { + "epoch": 1.9604376033583515, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62937355041504, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8690198659896851, + "num_tokens": 587790234.0, + "step": 15411 + }, + { + "epoch": 1.960564813636942, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.002275466918945, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8834294080734253, + "num_tokens": 587830185.0, + "step": 15412 + }, + { + "epoch": 1.9606920239155323, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.821020126342773, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8715264797210693, + "num_tokens": 587866265.0, + "step": 15413 + }, + { + "epoch": 1.9608192341941229, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.021364212036133, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8821606636047363, + "num_tokens": 587904956.0, + "step": 15414 + }, + { + "epoch": 1.9609464444727134, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.54899787902832, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8762180805206299, + "num_tokens": 587942369.0, + "step": 15415 + }, + { + "epoch": 1.961073654751304, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.822799682617188, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.875898003578186, + "num_tokens": 587979340.0, + "step": 15416 + }, + { + "epoch": 1.9612008650298944, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.768842697143555, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8659038543701172, + "num_tokens": 588021812.0, + "step": 15417 + }, + { + "epoch": 1.961328075308485, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84403419494629, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8667647838592529, + "num_tokens": 588064978.0, + "step": 15418 + }, + { + "epoch": 1.9614552855870753, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.073997497558594, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8660082817077637, + "num_tokens": 588098142.0, + "step": 15419 + }, + { + "epoch": 1.9615824958656658, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.5358829498291, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8778048753738403, + "num_tokens": 588141808.0, + "step": 15420 + }, + { + "epoch": 1.9617097061442563, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.644243240356445, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8776128888130188, + "num_tokens": 588178137.0, + "step": 15421 + }, + { + "epoch": 1.9618369164228469, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.889955520629883, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8637065291404724, + "num_tokens": 588215646.0, + "step": 15422 + }, + { + "epoch": 1.9619641267014374, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62106704711914, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.861797034740448, + "num_tokens": 588251345.0, + "step": 15423 + }, + { + "epoch": 1.962091336980028, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91881561279297, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8765021562576294, + "num_tokens": 588291222.0, + "step": 15424 + }, + { + "epoch": 1.9622185472586184, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.861618041992188, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8617491722106934, + "num_tokens": 588329480.0, + "step": 15425 + }, + { + "epoch": 1.962345757537209, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.827234268188477, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8618224263191223, + "num_tokens": 588367143.0, + "step": 15426 + }, + { + "epoch": 1.9624729678157995, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.872732162475586, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8786532878875732, + "num_tokens": 588408363.0, + "step": 15427 + }, + { + "epoch": 1.96260017809439, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90765380859375, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8774373531341553, + "num_tokens": 588445397.0, + "step": 15428 + }, + { + "epoch": 1.9627273883729806, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.850383758544922, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8764837384223938, + "num_tokens": 588480436.0, + "step": 15429 + }, + { + "epoch": 1.962854598651571, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.970335006713867, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8685750365257263, + "num_tokens": 588526918.0, + "step": 15430 + }, + { + "epoch": 1.9629818089301616, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.786148071289062, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8713729381561279, + "num_tokens": 588561300.0, + "step": 15431 + }, + { + "epoch": 1.9631090192087521, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69944190979004, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8602863550186157, + "num_tokens": 588603393.0, + "step": 15432 + }, + { + "epoch": 1.9632362294873427, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.793535232543945, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8769790530204773, + "num_tokens": 588643806.0, + "step": 15433 + }, + { + "epoch": 1.9633634397659332, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.812564849853516, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8699243068695068, + "num_tokens": 588685090.0, + "step": 15434 + }, + { + "epoch": 1.9634906500445237, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.855772018432617, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8586763143539429, + "num_tokens": 588720983.0, + "step": 15435 + }, + { + "epoch": 1.9636178603231143, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.730627059936523, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8628425598144531, + "num_tokens": 588760143.0, + "step": 15436 + }, + { + "epoch": 1.9637450706017048, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.017473220825195, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.872658908367157, + "num_tokens": 588796636.0, + "step": 15437 + }, + { + "epoch": 1.963872280880295, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.773134231567383, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8665610551834106, + "num_tokens": 588831681.0, + "step": 15438 + }, + { + "epoch": 1.9639994911588856, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99879264831543, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8795759677886963, + "num_tokens": 588875408.0, + "step": 15439 + }, + { + "epoch": 1.9641267014374761, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.723466873168945, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8744447231292725, + "num_tokens": 588913583.0, + "step": 15440 + }, + { + "epoch": 1.9642539117160667, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78889274597168, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8800528645515442, + "num_tokens": 588949833.0, + "step": 15441 + }, + { + "epoch": 1.9643811219946572, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.873584747314453, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8755840063095093, + "num_tokens": 588985678.0, + "step": 15442 + }, + { + "epoch": 1.9645083322732477, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.031951904296875, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8698673844337463, + "num_tokens": 589018276.0, + "step": 15443 + }, + { + "epoch": 1.964635542551838, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71206283569336, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8689787983894348, + "num_tokens": 589061900.0, + "step": 15444 + }, + { + "epoch": 1.9647627528304286, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.704998016357422, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8719605207443237, + "num_tokens": 589102740.0, + "step": 15445 + }, + { + "epoch": 1.964889963109019, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.770336151123047, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8805770874023438, + "num_tokens": 589135276.0, + "step": 15446 + }, + { + "epoch": 1.9650171733876096, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.641847610473633, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8705735206604004, + "num_tokens": 589173390.0, + "step": 15447 + }, + { + "epoch": 1.9651443836662001, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93566131591797, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8639588952064514, + "num_tokens": 589214592.0, + "step": 15448 + }, + { + "epoch": 1.9652715939447907, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.668407440185547, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8698925971984863, + "num_tokens": 589250682.0, + "step": 15449 + }, + { + "epoch": 1.9653988042233812, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.635257720947266, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8640282154083252, + "num_tokens": 589288326.0, + "step": 15450 + }, + { + "epoch": 1.9655260145019717, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73066520690918, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8662946224212646, + "num_tokens": 589321632.0, + "step": 15451 + }, + { + "epoch": 1.9656532247805623, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.874616622924805, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8613350987434387, + "num_tokens": 589361134.0, + "step": 15452 + }, + { + "epoch": 1.9657804350591528, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.907663345336914, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.872704267501831, + "num_tokens": 589402338.0, + "step": 15453 + }, + { + "epoch": 1.9659076453377433, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88987922668457, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.864459753036499, + "num_tokens": 589441080.0, + "step": 15454 + }, + { + "epoch": 1.9660348556163338, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82279396057129, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8672952055931091, + "num_tokens": 589484019.0, + "step": 15455 + }, + { + "epoch": 1.9661620658949244, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.70430564880371, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8653923273086548, + "num_tokens": 589525890.0, + "step": 15456 + }, + { + "epoch": 1.966289276173515, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.660856246948242, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8750677704811096, + "num_tokens": 589560644.0, + "step": 15457 + }, + { + "epoch": 1.9664164864521054, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.85765266418457, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8897455930709839, + "num_tokens": 589600863.0, + "step": 15458 + }, + { + "epoch": 1.966543696730696, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.721651077270508, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.869408130645752, + "num_tokens": 589637948.0, + "step": 15459 + }, + { + "epoch": 1.9666709070092865, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.780715942382812, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8830146789550781, + "num_tokens": 589678061.0, + "step": 15460 + }, + { + "epoch": 1.966798117287877, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.81473159790039, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8593546152114868, + "num_tokens": 589718050.0, + "step": 15461 + }, + { + "epoch": 1.9669253275664673, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.81328773498535, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8737383484840393, + "num_tokens": 589759206.0, + "step": 15462 + }, + { + "epoch": 1.9670525378450578, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.851003646850586, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8594051003456116, + "num_tokens": 589800218.0, + "step": 15463 + }, + { + "epoch": 1.9671797481236484, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84942626953125, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8711432814598083, + "num_tokens": 589843105.0, + "step": 15464 + }, + { + "epoch": 1.967306958402239, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.68501091003418, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.861479640007019, + "num_tokens": 589881560.0, + "step": 15465 + }, + { + "epoch": 1.9674341686808294, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87293815612793, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8586952686309814, + "num_tokens": 589922755.0, + "step": 15466 + }, + { + "epoch": 1.96756137895942, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.886043548583984, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8512988686561584, + "num_tokens": 589964982.0, + "step": 15467 + }, + { + "epoch": 1.9676885892380103, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.889013290405273, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8570892810821533, + "num_tokens": 590003371.0, + "step": 15468 + }, + { + "epoch": 1.9678157995166008, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.945280075073242, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.867421567440033, + "num_tokens": 590048361.0, + "step": 15469 + }, + { + "epoch": 1.9679430097951913, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7415828704834, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8668084740638733, + "num_tokens": 590083141.0, + "step": 15470 + }, + { + "epoch": 1.9680702200737819, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.893611907958984, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8642281293869019, + "num_tokens": 590128161.0, + "step": 15471 + }, + { + "epoch": 1.9681974303523724, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73583984375, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8851138353347778, + "num_tokens": 590166101.0, + "step": 15472 + }, + { + "epoch": 1.968324640630963, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.791664123535156, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8697614669799805, + "num_tokens": 590207643.0, + "step": 15473 + }, + { + "epoch": 1.9684518509095534, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.801355361938477, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8631696701049805, + "num_tokens": 590247567.0, + "step": 15474 + }, + { + "epoch": 1.968579061188144, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.747493743896484, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8658562898635864, + "num_tokens": 590287940.0, + "step": 15475 + }, + { + "epoch": 1.9687062714667345, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.55678367614746, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.864257276058197, + "num_tokens": 590322558.0, + "step": 15476 + }, + { + "epoch": 1.968833481745325, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82721710205078, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8684223890304565, + "num_tokens": 590362913.0, + "step": 15477 + }, + { + "epoch": 1.9689606920239155, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86448097229004, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8617472052574158, + "num_tokens": 590405877.0, + "step": 15478 + }, + { + "epoch": 1.969087902302506, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.583099365234375, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.860666036605835, + "num_tokens": 590445373.0, + "step": 15479 + }, + { + "epoch": 1.9692151125810966, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99355697631836, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8589707612991333, + "num_tokens": 590483319.0, + "step": 15480 + }, + { + "epoch": 1.9693423228596871, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.864439010620117, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8559902906417847, + "num_tokens": 590522433.0, + "step": 15481 + }, + { + "epoch": 1.9694695331382777, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94234275817871, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8735033273696899, + "num_tokens": 590564342.0, + "step": 15482 + }, + { + "epoch": 1.9695967434168682, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.733797073364258, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8721587061882019, + "num_tokens": 590594527.0, + "step": 15483 + }, + { + "epoch": 1.9697239536954587, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.840431213378906, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8814172148704529, + "num_tokens": 590630573.0, + "step": 15484 + }, + { + "epoch": 1.9698511639740492, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.85470962524414, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8709202408790588, + "num_tokens": 590669333.0, + "step": 15485 + }, + { + "epoch": 1.9699783742526398, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.887054443359375, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8846973180770874, + "num_tokens": 590709884.0, + "step": 15486 + }, + { + "epoch": 1.97010558453123, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.982160568237305, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8749626278877258, + "num_tokens": 590746109.0, + "step": 15487 + }, + { + "epoch": 1.9702327948098206, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79366111755371, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8542536497116089, + "num_tokens": 590787881.0, + "step": 15488 + }, + { + "epoch": 1.9703600050884111, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93281364440918, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8553434610366821, + "num_tokens": 590826861.0, + "step": 15489 + }, + { + "epoch": 1.9704872153670017, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82886505126953, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8775582313537598, + "num_tokens": 590863104.0, + "step": 15490 + }, + { + "epoch": 1.9706144256455922, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8391170501709, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8766727447509766, + "num_tokens": 590900582.0, + "step": 15491 + }, + { + "epoch": 1.9707416359241827, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.988567352294922, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8764740228652954, + "num_tokens": 590937735.0, + "step": 15492 + }, + { + "epoch": 1.970868846202773, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.72847557067871, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8839019536972046, + "num_tokens": 590973089.0, + "step": 15493 + }, + { + "epoch": 1.9709960564813636, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.853160858154297, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8568232655525208, + "num_tokens": 591012117.0, + "step": 15494 + }, + { + "epoch": 1.971123266759954, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92686653137207, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8506512641906738, + "num_tokens": 591044978.0, + "step": 15495 + }, + { + "epoch": 1.9712504770385446, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.939960479736328, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8667560815811157, + "num_tokens": 591085562.0, + "step": 15496 + }, + { + "epoch": 1.9713776873171351, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.753265380859375, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8603300452232361, + "num_tokens": 591120140.0, + "step": 15497 + }, + { + "epoch": 1.9715048975957257, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.97709846496582, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8606946468353271, + "num_tokens": 591153952.0, + "step": 15498 + }, + { + "epoch": 1.9716321078743162, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.757431030273438, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.885723888874054, + "num_tokens": 591192983.0, + "step": 15499 + }, + { + "epoch": 1.9717593181529067, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90067481994629, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8705655932426453, + "num_tokens": 591226260.0, + "step": 15500 + }, + { + "epoch": 1.9718865284314973, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69784164428711, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.857458233833313, + "num_tokens": 591259361.0, + "step": 15501 + }, + { + "epoch": 1.9720137387100878, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.852977752685547, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8705236315727234, + "num_tokens": 591299753.0, + "step": 15502 + }, + { + "epoch": 1.9721409489886783, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.699729919433594, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8717265129089355, + "num_tokens": 591335008.0, + "step": 15503 + }, + { + "epoch": 1.9722681592672688, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.908126831054688, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8766169548034668, + "num_tokens": 591373001.0, + "step": 15504 + }, + { + "epoch": 1.9723953695458594, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.800390243530273, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8714187145233154, + "num_tokens": 591408637.0, + "step": 15505 + }, + { + "epoch": 1.97252257982445, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.698089599609375, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8671948909759521, + "num_tokens": 591444908.0, + "step": 15506 + }, + { + "epoch": 1.9726497901030404, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.893022537231445, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8722501397132874, + "num_tokens": 591484641.0, + "step": 15507 + }, + { + "epoch": 1.972777000381631, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.794315338134766, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8659567832946777, + "num_tokens": 591521859.0, + "step": 15508 + }, + { + "epoch": 1.9729042106602215, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.051023483276367, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8681473731994629, + "num_tokens": 591559124.0, + "step": 15509 + }, + { + "epoch": 1.973031420938812, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76793098449707, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8677976131439209, + "num_tokens": 591597170.0, + "step": 15510 + }, + { + "epoch": 1.9731586312174023, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.583393096923828, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8756944537162781, + "num_tokens": 591634784.0, + "step": 15511 + }, + { + "epoch": 1.9732858414959928, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9288387298584, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8746527433395386, + "num_tokens": 591671904.0, + "step": 15512 + }, + { + "epoch": 1.9734130517745834, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.764822006225586, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8714504837989807, + "num_tokens": 591712990.0, + "step": 15513 + }, + { + "epoch": 1.973540262053174, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.674156188964844, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8667134046554565, + "num_tokens": 591748579.0, + "step": 15514 + }, + { + "epoch": 1.9736674723317644, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.954544067382812, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8631268739700317, + "num_tokens": 591785865.0, + "step": 15515 + }, + { + "epoch": 1.973794682610355, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.991193771362305, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8789969682693481, + "num_tokens": 591820087.0, + "step": 15516 + }, + { + "epoch": 1.9739218928889453, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.798490524291992, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8689730167388916, + "num_tokens": 591859713.0, + "step": 15517 + }, + { + "epoch": 1.9740491031675358, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.768415451049805, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8767017722129822, + "num_tokens": 591893142.0, + "step": 15518 + }, + { + "epoch": 1.9741763134461263, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.908493041992188, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8645857572555542, + "num_tokens": 591931529.0, + "step": 15519 + }, + { + "epoch": 1.9743035237247168, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.866601943969727, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8743611574172974, + "num_tokens": 591976248.0, + "step": 15520 + }, + { + "epoch": 1.9744307340033074, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.823570251464844, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8670110702514648, + "num_tokens": 592018008.0, + "step": 15521 + }, + { + "epoch": 1.974557944281898, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.869964599609375, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8555421233177185, + "num_tokens": 592062934.0, + "step": 15522 + }, + { + "epoch": 1.9746851545604884, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.71150779724121, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8676872253417969, + "num_tokens": 592103129.0, + "step": 15523 + }, + { + "epoch": 1.974812364839079, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.915672302246094, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8650637865066528, + "num_tokens": 592142116.0, + "step": 15524 + }, + { + "epoch": 1.9749395751176695, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.807458877563477, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8697983622550964, + "num_tokens": 592182507.0, + "step": 15525 + }, + { + "epoch": 1.97506678539626, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84390640258789, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8479458689689636, + "num_tokens": 592225593.0, + "step": 15526 + }, + { + "epoch": 1.9751939956748505, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.147249221801758, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8937336206436157, + "num_tokens": 592270676.0, + "step": 15527 + }, + { + "epoch": 1.975321205953441, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.955591201782227, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8881152868270874, + "num_tokens": 592305537.0, + "step": 15528 + }, + { + "epoch": 1.9754484162320316, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84354591369629, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8760764598846436, + "num_tokens": 592338289.0, + "step": 15529 + }, + { + "epoch": 1.9755756265106221, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00042152404785, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8714263439178467, + "num_tokens": 592379189.0, + "step": 15530 + }, + { + "epoch": 1.9757028367892127, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9966983795166, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8833376169204712, + "num_tokens": 592421875.0, + "step": 15531 + }, + { + "epoch": 1.9758300470678032, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.909603118896484, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8509864807128906, + "num_tokens": 592457438.0, + "step": 15532 + }, + { + "epoch": 1.9759572573463937, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87129020690918, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.880742073059082, + "num_tokens": 592493230.0, + "step": 15533 + }, + { + "epoch": 1.9760844676249842, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.85250473022461, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.853498101234436, + "num_tokens": 592533656.0, + "step": 15534 + }, + { + "epoch": 1.9762116779035748, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.811479568481445, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8827506899833679, + "num_tokens": 592571622.0, + "step": 15535 + }, + { + "epoch": 1.976338888182165, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7138729095459, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8651825189590454, + "num_tokens": 592613260.0, + "step": 15536 + }, + { + "epoch": 1.9764660984607556, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.770395278930664, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8667765259742737, + "num_tokens": 592648872.0, + "step": 15537 + }, + { + "epoch": 1.9765933087393461, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90958595275879, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8698980212211609, + "num_tokens": 592682336.0, + "step": 15538 + }, + { + "epoch": 1.9767205190179367, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.553565979003906, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.883515477180481, + "num_tokens": 592716412.0, + "step": 15539 + }, + { + "epoch": 1.9768477292965272, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.247812271118164, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8731819987297058, + "num_tokens": 592753936.0, + "step": 15540 + }, + { + "epoch": 1.9769749395751177, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.635169982910156, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8701184988021851, + "num_tokens": 592792731.0, + "step": 15541 + }, + { + "epoch": 1.977102149853708, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08619499206543, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8728447556495667, + "num_tokens": 592832035.0, + "step": 15542 + }, + { + "epoch": 1.9772293601322986, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.64691734313965, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8866701126098633, + "num_tokens": 592869693.0, + "step": 15543 + }, + { + "epoch": 1.977356570410889, + "ewc_loss": 0.035400390625, + "ewc_loss_parallel": 3.528594970703125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08847427368164, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8724974393844604, + "num_tokens": 592909648.0, + "step": 15544 + }, + { + "epoch": 1.9774837806894796, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82727813720703, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8736348152160645, + "num_tokens": 592942539.0, + "step": 15545 + }, + { + "epoch": 1.9776109909680701, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.729190826416016, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8687880635261536, + "num_tokens": 592983680.0, + "step": 15546 + }, + { + "epoch": 1.9777382012466607, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16083335876465, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8588310480117798, + "num_tokens": 593023161.0, + "step": 15547 + }, + { + "epoch": 1.9778654115252512, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80611228942871, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8510544896125793, + "num_tokens": 593066612.0, + "step": 15548 + }, + { + "epoch": 1.9779926218038417, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86871337890625, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8588813543319702, + "num_tokens": 593105224.0, + "step": 15549 + }, + { + "epoch": 1.9781198320824323, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.804792404174805, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8713784217834473, + "num_tokens": 593143683.0, + "step": 15550 + }, + { + "epoch": 1.9782470423610228, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94812774658203, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8557530641555786, + "num_tokens": 593179827.0, + "step": 15551 + }, + { + "epoch": 1.9783742526396133, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86977767944336, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8705582618713379, + "num_tokens": 593220936.0, + "step": 15552 + }, + { + "epoch": 1.9785014629182038, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88085174560547, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8652166128158569, + "num_tokens": 593258503.0, + "step": 15553 + }, + { + "epoch": 1.9786286731967944, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76017951965332, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8499435186386108, + "num_tokens": 593293738.0, + "step": 15554 + }, + { + "epoch": 1.978755883475385, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.552436828613281e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.737018585205078, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8600830435752869, + "num_tokens": 593338037.0, + "step": 15555 + }, + { + "epoch": 1.9788830937539754, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9112491607666, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8539993166923523, + "num_tokens": 593383509.0, + "step": 15556 + }, + { + "epoch": 1.979010304032566, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.723859786987305, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8734989166259766, + "num_tokens": 593417734.0, + "step": 15557 + }, + { + "epoch": 1.9791375143111565, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.902055740356445, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8823765516281128, + "num_tokens": 593454502.0, + "step": 15558 + }, + { + "epoch": 1.979264724589747, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.886035919189453, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8653804659843445, + "num_tokens": 593498864.0, + "step": 15559 + }, + { + "epoch": 1.9793919348683373, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.692564010620117, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8661986589431763, + "num_tokens": 593540125.0, + "step": 15560 + }, + { + "epoch": 1.9795191451469278, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.940929412841797, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8801833391189575, + "num_tokens": 593566937.0, + "step": 15561 + }, + { + "epoch": 1.9796463554255184, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83635902404785, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8647323250770569, + "num_tokens": 593600109.0, + "step": 15562 + }, + { + "epoch": 1.979773565704109, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.62232208251953, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8681581616401672, + "num_tokens": 593638982.0, + "step": 15563 + }, + { + "epoch": 1.9799007759826994, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.014617919921875, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8601475954055786, + "num_tokens": 593677270.0, + "step": 15564 + }, + { + "epoch": 1.98002798626129, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.613807678222656, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8766196966171265, + "num_tokens": 593714056.0, + "step": 15565 + }, + { + "epoch": 1.9801551965398803, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.826753616333008, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8583707213401794, + "num_tokens": 593753516.0, + "step": 15566 + }, + { + "epoch": 1.9802824068184708, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.841981887817383, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8720448017120361, + "num_tokens": 593789276.0, + "step": 15567 + }, + { + "epoch": 1.9804096170970613, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.741369247436523, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8733171820640564, + "num_tokens": 593828112.0, + "step": 15568 + }, + { + "epoch": 1.9805368273756518, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.781475067138672, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8575421571731567, + "num_tokens": 593866370.0, + "step": 15569 + }, + { + "epoch": 1.9806640376542424, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.940080642700195, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8732147216796875, + "num_tokens": 593907863.0, + "step": 15570 + }, + { + "epoch": 1.980791247932833, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.73607063293457, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8805437088012695, + "num_tokens": 593945254.0, + "step": 15571 + }, + { + "epoch": 1.9809184582114234, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.088850021362305, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8736075758934021, + "num_tokens": 593987105.0, + "step": 15572 + }, + { + "epoch": 1.981045668490014, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79899024963379, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8809458017349243, + "num_tokens": 594024091.0, + "step": 15573 + }, + { + "epoch": 1.9811728787686045, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.974912643432617, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8799448609352112, + "num_tokens": 594056018.0, + "step": 15574 + }, + { + "epoch": 1.981300089047195, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88503646850586, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8724383115768433, + "num_tokens": 594102297.0, + "step": 15575 + }, + { + "epoch": 1.9814272993257855, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.855634689331055, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8708980083465576, + "num_tokens": 594140278.0, + "step": 15576 + }, + { + "epoch": 1.981554509604376, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.063087463378906, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8738065958023071, + "num_tokens": 594175025.0, + "step": 15577 + }, + { + "epoch": 1.9816817198829666, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.673446655273438, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8685423135757446, + "num_tokens": 594217906.0, + "step": 15578 + }, + { + "epoch": 1.9818089301615571, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.818126678466797, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8518679141998291, + "num_tokens": 594264331.0, + "step": 15579 + }, + { + "epoch": 1.9819361404401477, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.841068267822266, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8843985795974731, + "num_tokens": 594300220.0, + "step": 15580 + }, + { + "epoch": 1.9820633507187382, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06460952758789, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8761405944824219, + "num_tokens": 594337849.0, + "step": 15581 + }, + { + "epoch": 1.9821905609973287, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.784006118774414, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8642600178718567, + "num_tokens": 594379295.0, + "step": 15582 + }, + { + "epoch": 1.9823177712759192, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.062387466430664, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.865847110748291, + "num_tokens": 594408350.0, + "step": 15583 + }, + { + "epoch": 1.9824449815545098, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.022533416748047, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8591475486755371, + "num_tokens": 594455110.0, + "step": 15584 + }, + { + "epoch": 1.9825721918331, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.707887649536133, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8731410503387451, + "num_tokens": 594499231.0, + "step": 15585 + }, + { + "epoch": 1.9826994021116906, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.953638076782227, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8505028486251831, + "num_tokens": 594541478.0, + "step": 15586 + }, + { + "epoch": 1.9828266123902811, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61711883544922, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8727539777755737, + "num_tokens": 594579075.0, + "step": 15587 + }, + { + "epoch": 1.9829538226688717, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82573699951172, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8868160843849182, + "num_tokens": 594616189.0, + "step": 15588 + }, + { + "epoch": 1.9830810329474622, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79472541809082, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8762959241867065, + "num_tokens": 594656633.0, + "step": 15589 + }, + { + "epoch": 1.9832082432260527, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.871183395385742, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8513519763946533, + "num_tokens": 594696833.0, + "step": 15590 + }, + { + "epoch": 1.983335453504643, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.72355079650879, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.857269823551178, + "num_tokens": 594735138.0, + "step": 15591 + }, + { + "epoch": 1.9834626637832335, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.6235408782959, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8635858297348022, + "num_tokens": 594769587.0, + "step": 15592 + }, + { + "epoch": 1.983589874061824, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848108291625977, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8747774362564087, + "num_tokens": 594803635.0, + "step": 15593 + }, + { + "epoch": 1.9837170843404146, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.681711196899414, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.885757565498352, + "num_tokens": 594839872.0, + "step": 15594 + }, + { + "epoch": 1.9838442946190051, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79022789001465, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8581366539001465, + "num_tokens": 594879208.0, + "step": 15595 + }, + { + "epoch": 1.9839715048975957, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.864362716674805, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8758626580238342, + "num_tokens": 594910377.0, + "step": 15596 + }, + { + "epoch": 1.9840987151761862, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.750946044921875, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8714800477027893, + "num_tokens": 594956574.0, + "step": 15597 + }, + { + "epoch": 1.9842259254547767, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.692731857299805, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8539061546325684, + "num_tokens": 594998036.0, + "step": 15598 + }, + { + "epoch": 1.9843531357333672, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.777528762817383, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8643175363540649, + "num_tokens": 595031404.0, + "step": 15599 + }, + { + "epoch": 1.9844803460119578, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.891849517822266, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8787308931350708, + "num_tokens": 595069800.0, + "step": 15600 + }, + { + "epoch": 1.9846075562905483, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.008359909057617, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8752803802490234, + "num_tokens": 595110987.0, + "step": 15601 + }, + { + "epoch": 1.9847347665691388, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91875648498535, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8725022077560425, + "num_tokens": 595149192.0, + "step": 15602 + }, + { + "epoch": 1.9848619768477294, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.775793075561523, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8684669733047485, + "num_tokens": 595182993.0, + "step": 15603 + }, + { + "epoch": 1.9849891871263199, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.922077178955078, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8665013909339905, + "num_tokens": 595216177.0, + "step": 15604 + }, + { + "epoch": 1.9851163974049104, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.665159225463867, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8699041604995728, + "num_tokens": 595253379.0, + "step": 15605 + }, + { + "epoch": 1.985243607683501, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82343101501465, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8739378452301025, + "num_tokens": 595292878.0, + "step": 15606 + }, + { + "epoch": 1.9853708179620915, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.779010772705078, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8715929388999939, + "num_tokens": 595329082.0, + "step": 15607 + }, + { + "epoch": 1.985498028240682, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.846826553344727, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8620520830154419, + "num_tokens": 595367246.0, + "step": 15608 + }, + { + "epoch": 1.9856252385192723, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.919477462768555, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8765050172805786, + "num_tokens": 595404450.0, + "step": 15609 + }, + { + "epoch": 1.9857524487978628, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80035972595215, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8823250532150269, + "num_tokens": 595439718.0, + "step": 15610 + }, + { + "epoch": 1.9858796590764534, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.757856369018555, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8719354867935181, + "num_tokens": 595473617.0, + "step": 15611 + }, + { + "epoch": 1.986006869355044, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.852846145629883, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8671596050262451, + "num_tokens": 595516432.0, + "step": 15612 + }, + { + "epoch": 1.9861340796336344, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.999263763427734, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8717535734176636, + "num_tokens": 595556278.0, + "step": 15613 + }, + { + "epoch": 1.986261289912225, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.703798294067383, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8757730722427368, + "num_tokens": 595598713.0, + "step": 15614 + }, + { + "epoch": 1.9863885001908153, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.897010803222656, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8724864721298218, + "num_tokens": 595630597.0, + "step": 15615 + }, + { + "epoch": 1.9865157104694058, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.782739639282227, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8874282836914062, + "num_tokens": 595664163.0, + "step": 15616 + }, + { + "epoch": 1.9866429207479963, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.875713348388672, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8795844316482544, + "num_tokens": 595698809.0, + "step": 15617 + }, + { + "epoch": 1.9867701310265868, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.779388427734375, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8652093410491943, + "num_tokens": 595741166.0, + "step": 15618 + }, + { + "epoch": 1.9868973413051774, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.723203659057617, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.865386426448822, + "num_tokens": 595787079.0, + "step": 15619 + }, + { + "epoch": 1.987024551583768, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.932838439941406, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8457431793212891, + "num_tokens": 595829085.0, + "step": 15620 + }, + { + "epoch": 1.9871517618623584, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.829151153564453, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8681953549385071, + "num_tokens": 595870646.0, + "step": 15621 + }, + { + "epoch": 1.987278972140949, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.800323486328125, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8601531982421875, + "num_tokens": 595918473.0, + "step": 15622 + }, + { + "epoch": 1.9874061824195395, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.786069869995117, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8593047857284546, + "num_tokens": 595954097.0, + "step": 15623 + }, + { + "epoch": 1.98753339269813, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.898008346557617, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8518706560134888, + "num_tokens": 595990191.0, + "step": 15624 + }, + { + "epoch": 1.9876606029767205, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.866474151611328, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8630117177963257, + "num_tokens": 596027109.0, + "step": 15625 + }, + { + "epoch": 1.987787813255311, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83228874206543, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8785164952278137, + "num_tokens": 596067930.0, + "step": 15626 + }, + { + "epoch": 1.9879150235339016, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80303192138672, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.880766749382019, + "num_tokens": 596105809.0, + "step": 15627 + }, + { + "epoch": 1.9880422338124921, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.814685821533203, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8733822107315063, + "num_tokens": 596144220.0, + "step": 15628 + }, + { + "epoch": 1.9881694440910826, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.121509552001953, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.847878098487854, + "num_tokens": 596183825.0, + "step": 15629 + }, + { + "epoch": 1.9882966543696732, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.638486862182617, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8785964250564575, + "num_tokens": 596232333.0, + "step": 15630 + }, + { + "epoch": 1.9884238646482637, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96407699584961, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8786958456039429, + "num_tokens": 596272138.0, + "step": 15631 + }, + { + "epoch": 1.9885510749268542, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.839967727661133, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8793478012084961, + "num_tokens": 596310254.0, + "step": 15632 + }, + { + "epoch": 1.9886782852054448, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.015596389770508, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8665120601654053, + "num_tokens": 596350396.0, + "step": 15633 + }, + { + "epoch": 1.988805495484035, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93608283996582, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8738548755645752, + "num_tokens": 596388809.0, + "step": 15634 + }, + { + "epoch": 1.9889327057626256, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86749839782715, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.878151535987854, + "num_tokens": 596427366.0, + "step": 15635 + }, + { + "epoch": 1.9890599160412161, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.995487213134766, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8602018356323242, + "num_tokens": 596469999.0, + "step": 15636 + }, + { + "epoch": 1.9891871263198067, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.63575553894043, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8547041416168213, + "num_tokens": 596506773.0, + "step": 15637 + }, + { + "epoch": 1.9893143365983972, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.810209274291992, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8716524839401245, + "num_tokens": 596547440.0, + "step": 15638 + }, + { + "epoch": 1.9894415468769877, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09532356262207, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8766283392906189, + "num_tokens": 596580212.0, + "step": 15639 + }, + { + "epoch": 1.989568757155578, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.831937789916992, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8912746906280518, + "num_tokens": 596611790.0, + "step": 15640 + }, + { + "epoch": 1.9896959674341685, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.97547149658203, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8852411508560181, + "num_tokens": 596646984.0, + "step": 15641 + }, + { + "epoch": 1.989823177712759, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75879669189453, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.866003692150116, + "num_tokens": 596688362.0, + "step": 15642 + }, + { + "epoch": 1.9899503879913496, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.730405807495117, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8724607229232788, + "num_tokens": 596725298.0, + "step": 15643 + }, + { + "epoch": 1.9900775982699401, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87921905517578, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8766210079193115, + "num_tokens": 596757400.0, + "step": 15644 + }, + { + "epoch": 1.9902048085485307, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9716739654541, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8826110363006592, + "num_tokens": 596790533.0, + "step": 15645 + }, + { + "epoch": 1.9903320188271212, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.020587921142578, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8641180396080017, + "num_tokens": 596834467.0, + "step": 15646 + }, + { + "epoch": 1.9904592291057117, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.723180770874023, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8576984405517578, + "num_tokens": 596873262.0, + "step": 15647 + }, + { + "epoch": 1.9905864393843022, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.854331970214844, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8705998659133911, + "num_tokens": 596912833.0, + "step": 15648 + }, + { + "epoch": 1.9907136496628928, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.888893127441406, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8779392242431641, + "num_tokens": 596953554.0, + "step": 15649 + }, + { + "epoch": 1.9908408599414833, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.960535049438477, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8649139404296875, + "num_tokens": 596991206.0, + "step": 15650 + }, + { + "epoch": 1.9909680702200738, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.832630157470703, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8716840744018555, + "num_tokens": 597027222.0, + "step": 15651 + }, + { + "epoch": 1.9910952804986644, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.713157653808594, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8788696527481079, + "num_tokens": 597069040.0, + "step": 15652 + }, + { + "epoch": 1.9912224907772549, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89793586730957, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8626430034637451, + "num_tokens": 597112529.0, + "step": 15653 + }, + { + "epoch": 1.9913497010558454, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.899030685424805, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8718639016151428, + "num_tokens": 597148304.0, + "step": 15654 + }, + { + "epoch": 1.991476911334436, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.750263214111328, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8790940046310425, + "num_tokens": 597184244.0, + "step": 15655 + }, + { + "epoch": 1.9916041216130265, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.912925720214844, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8632054924964905, + "num_tokens": 597218178.0, + "step": 15656 + }, + { + "epoch": 1.991731331891617, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.85856056213379, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.867398202419281, + "num_tokens": 597259256.0, + "step": 15657 + }, + { + "epoch": 1.9918585421702073, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87782096862793, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8812037706375122, + "num_tokens": 597296584.0, + "step": 15658 + }, + { + "epoch": 1.9919857524487978, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.832374572753906, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.85577791929245, + "num_tokens": 597341116.0, + "step": 15659 + }, + { + "epoch": 1.9921129627273884, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.780595779418945, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8675847053527832, + "num_tokens": 597375595.0, + "step": 15660 + }, + { + "epoch": 1.9922401730059789, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89417266845703, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8568049669265747, + "num_tokens": 597419226.0, + "step": 15661 + }, + { + "epoch": 1.9923673832845694, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.61724853515625, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.851386308670044, + "num_tokens": 597461242.0, + "step": 15662 + }, + { + "epoch": 1.99249459356316, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.865842819213867, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8703761696815491, + "num_tokens": 597500030.0, + "step": 15663 + }, + { + "epoch": 1.9926218038417502, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87021827697754, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.854718029499054, + "num_tokens": 597540428.0, + "step": 15664 + }, + { + "epoch": 1.9927490141203408, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.810998916625977, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.881057858467102, + "num_tokens": 597575070.0, + "step": 15665 + }, + { + "epoch": 1.9928762243989313, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.7114200592041, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8690885901451111, + "num_tokens": 597617425.0, + "step": 15666 + }, + { + "epoch": 1.9930034346775218, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76980209350586, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8765457272529602, + "num_tokens": 597660489.0, + "step": 15667 + }, + { + "epoch": 1.9931306449561124, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.792400360107422, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8754047155380249, + "num_tokens": 597703626.0, + "step": 15668 + }, + { + "epoch": 1.993257855234703, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86296844482422, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8652803301811218, + "num_tokens": 597741176.0, + "step": 15669 + }, + { + "epoch": 1.9933850655132934, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79747772216797, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8577065467834473, + "num_tokens": 597779763.0, + "step": 15670 + }, + { + "epoch": 1.993512275791884, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79900360107422, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8741101026535034, + "num_tokens": 597813300.0, + "step": 15671 + }, + { + "epoch": 1.9936394860704745, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.792612075805664, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8598257303237915, + "num_tokens": 597849062.0, + "step": 15672 + }, + { + "epoch": 1.993766696349065, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.742206573486328, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8703666925430298, + "num_tokens": 597886565.0, + "step": 15673 + }, + { + "epoch": 1.9938939066276555, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.95992088317871, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8728767037391663, + "num_tokens": 597925097.0, + "step": 15674 + }, + { + "epoch": 1.994021116906246, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90331268310547, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8823108077049255, + "num_tokens": 597960786.0, + "step": 15675 + }, + { + "epoch": 1.9941483271848366, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.909561157226562, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8516423106193542, + "num_tokens": 598000442.0, + "step": 15676 + }, + { + "epoch": 1.9942755374634271, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.820802688598633, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8676208257675171, + "num_tokens": 598042946.0, + "step": 15677 + }, + { + "epoch": 1.9944027477420176, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.093828201293945, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8500851392745972, + "num_tokens": 598081659.0, + "step": 15678 + }, + { + "epoch": 1.9945299580206082, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.722759246826172, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8639261722564697, + "num_tokens": 598120203.0, + "step": 15679 + }, + { + "epoch": 1.9946571682991987, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00092124938965, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8573931455612183, + "num_tokens": 598154779.0, + "step": 15680 + }, + { + "epoch": 1.9947843785777892, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.800403594970703, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8692912459373474, + "num_tokens": 598197075.0, + "step": 15681 + }, + { + "epoch": 1.9949115888563798, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.795238494873047, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8623678684234619, + "num_tokens": 598240539.0, + "step": 15682 + }, + { + "epoch": 1.99503879913497, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.803077697753906, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8710541129112244, + "num_tokens": 598281776.0, + "step": 15683 + }, + { + "epoch": 1.9951660094135606, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.866891860961914, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8769035339355469, + "num_tokens": 598322715.0, + "step": 15684 + }, + { + "epoch": 1.9952932196921511, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.744213104248047, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8641477227210999, + "num_tokens": 598364495.0, + "step": 15685 + }, + { + "epoch": 1.9954204299707416, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.996219635009766, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8662731647491455, + "num_tokens": 598407102.0, + "step": 15686 + }, + { + "epoch": 1.9955476402493322, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.757417678833008, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8555889129638672, + "num_tokens": 598448600.0, + "step": 15687 + }, + { + "epoch": 1.9956748505279227, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87635040283203, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.859063982963562, + "num_tokens": 598491140.0, + "step": 15688 + }, + { + "epoch": 1.995802060806513, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06914520263672, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8650401830673218, + "num_tokens": 598527831.0, + "step": 15689 + }, + { + "epoch": 1.9959292710851035, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.626319885253906, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8560723066329956, + "num_tokens": 598568680.0, + "step": 15690 + }, + { + "epoch": 1.996056481363694, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92043685913086, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8581515550613403, + "num_tokens": 598606254.0, + "step": 15691 + }, + { + "epoch": 1.9961836916422846, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.780025482177734, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8715982437133789, + "num_tokens": 598642400.0, + "step": 15692 + }, + { + "epoch": 1.9963109019208751, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83427619934082, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8476018309593201, + "num_tokens": 598678379.0, + "step": 15693 + }, + { + "epoch": 1.9964381121994657, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82818031311035, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8671543598175049, + "num_tokens": 598718234.0, + "step": 15694 + }, + { + "epoch": 1.9965653224780562, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94771385192871, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8728287220001221, + "num_tokens": 598754897.0, + "step": 15695 + }, + { + "epoch": 1.9966925327566467, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.915067672729492, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8637247085571289, + "num_tokens": 598785268.0, + "step": 15696 + }, + { + "epoch": 1.9968197430352372, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.916967391967773, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8754560351371765, + "num_tokens": 598827131.0, + "step": 15697 + }, + { + "epoch": 1.9969469533138278, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.768529891967773, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8698663711547852, + "num_tokens": 598859507.0, + "step": 15698 + }, + { + "epoch": 1.9970741635924183, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98277473449707, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8592360615730286, + "num_tokens": 598893280.0, + "step": 15699 + }, + { + "epoch": 1.9972013738710088, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91874122619629, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8823291063308716, + "num_tokens": 598932759.0, + "step": 15700 + }, + { + "epoch": 1.9973285841495994, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.777748107910156, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8788343667984009, + "num_tokens": 598972899.0, + "step": 15701 + }, + { + "epoch": 1.9974557944281899, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.906208038330078, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8704190254211426, + "num_tokens": 599012811.0, + "step": 15702 + }, + { + "epoch": 1.9975830047067804, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.933576583862305, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8602277636528015, + "num_tokens": 599055632.0, + "step": 15703 + }, + { + "epoch": 1.997710214985371, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.001562118530273, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8512723445892334, + "num_tokens": 599090924.0, + "step": 15704 + }, + { + "epoch": 1.9978374252639615, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.893844604492188, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8825820684432983, + "num_tokens": 599130434.0, + "step": 15705 + }, + { + "epoch": 1.997964635542552, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.015125274658203, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8678213953971863, + "num_tokens": 599171018.0, + "step": 15706 + }, + { + "epoch": 1.9980918458211423, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.108373641967773, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8597386479377747, + "num_tokens": 599212445.0, + "step": 15707 + }, + { + "epoch": 1.9982190560997328, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76321792602539, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8738138675689697, + "num_tokens": 599250591.0, + "step": 15708 + }, + { + "epoch": 1.9983462663783234, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.946672439575195, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8652958869934082, + "num_tokens": 599284987.0, + "step": 15709 + }, + { + "epoch": 1.9984734766569139, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.014921188354492, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8644976615905762, + "num_tokens": 599324656.0, + "step": 15710 + }, + { + "epoch": 1.9986006869355044, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.882883071899414, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8798971176147461, + "num_tokens": 599364014.0, + "step": 15711 + }, + { + "epoch": 1.998727897214095, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.77360725402832, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8610533475875854, + "num_tokens": 599396781.0, + "step": 15712 + }, + { + "epoch": 1.9988551074926852, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.825319290161133, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8784472942352295, + "num_tokens": 599433240.0, + "step": 15713 + }, + { + "epoch": 1.9989823177712758, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.901086807250977, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.878426194190979, + "num_tokens": 599474883.0, + "step": 15714 + }, + { + "epoch": 1.9991095280498663, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.906112670898438, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.869526743888855, + "num_tokens": 599508492.0, + "step": 15715 + }, + { + "epoch": 1.9992367383284568, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87332534790039, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8719844222068787, + "num_tokens": 599545946.0, + "step": 15716 + }, + { + "epoch": 1.9993639486070474, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92169952392578, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8825085163116455, + "num_tokens": 599586217.0, + "step": 15717 + }, + { + "epoch": 1.9994911588856379, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.831876754760742, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8769813179969788, + "num_tokens": 599620431.0, + "step": 15718 + }, + { + "epoch": 1.9996183691642284, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.875272750854492, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8555744290351868, + "num_tokens": 599659959.0, + "step": 15719 + }, + { + "epoch": 1.999745579442819, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92007827758789, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8678712844848633, + "num_tokens": 599696558.0, + "step": 15720 + }, + { + "epoch": 1.9998727897214095, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9410343170166, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.858124852180481, + "num_tokens": 599734925.0, + "step": 15721 + }, + { + "epoch": 2.0, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.739709854125977, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.873399019241333, + "num_tokens": 599772613.0, + "step": 15722 + }, + { + "epoch": 2.0001272102785905, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87119483947754, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8746035099029541, + "num_tokens": 599813892.0, + "step": 15723 + }, + { + "epoch": 2.000254420557181, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.936508178710938, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8715023994445801, + "num_tokens": 599852807.0, + "step": 15724 + }, + { + "epoch": 2.0003816308357716, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.876935958862305, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8775815367698669, + "num_tokens": 599893712.0, + "step": 15725 + }, + { + "epoch": 2.000508841114362, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.914052963256836, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8721021413803101, + "num_tokens": 599929954.0, + "step": 15726 + }, + { + "epoch": 2.0006360513929526, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.053457260131836, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8688827157020569, + "num_tokens": 599967763.0, + "step": 15727 + }, + { + "epoch": 2.000763261671543, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.886266708374023, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8798184990882874, + "num_tokens": 600008938.0, + "step": 15728 + }, + { + "epoch": 2.0008904719501337, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.900421142578125, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8673192858695984, + "num_tokens": 600051849.0, + "step": 15729 + }, + { + "epoch": 2.0010176822287242, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.982460021972656, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.877518892288208, + "num_tokens": 600088568.0, + "step": 15730 + }, + { + "epoch": 2.0011448925073148, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.860679626464844, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8756493926048279, + "num_tokens": 600131553.0, + "step": 15731 + }, + { + "epoch": 2.0012721027859053, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.876636505126953, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8717638850212097, + "num_tokens": 600173198.0, + "step": 15732 + }, + { + "epoch": 2.001399313064496, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.904788970947266, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8762416243553162, + "num_tokens": 600211347.0, + "step": 15733 + }, + { + "epoch": 2.0015265233430863, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8837890625, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8659781813621521, + "num_tokens": 600249953.0, + "step": 15734 + }, + { + "epoch": 2.0016537336216764, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.866790771484375, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8641204833984375, + "num_tokens": 600291211.0, + "step": 15735 + }, + { + "epoch": 2.001780943900267, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99505615234375, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8768558502197266, + "num_tokens": 600332906.0, + "step": 15736 + }, + { + "epoch": 2.0019081541788575, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.937162399291992, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.871145486831665, + "num_tokens": 600370570.0, + "step": 15737 + }, + { + "epoch": 2.002035364457448, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.935977935791016, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8623511791229248, + "num_tokens": 600409368.0, + "step": 15738 + }, + { + "epoch": 2.0021625747360385, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9271240234375, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8755508065223694, + "num_tokens": 600442019.0, + "step": 15739 + }, + { + "epoch": 2.002289785014629, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09127426147461, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8804063200950623, + "num_tokens": 600471665.0, + "step": 15740 + }, + { + "epoch": 2.0024169952932196, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.992063522338867, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8700448870658875, + "num_tokens": 600508804.0, + "step": 15741 + }, + { + "epoch": 2.00254420557181, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20496940612793, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8604327440261841, + "num_tokens": 600543895.0, + "step": 15742 + }, + { + "epoch": 2.0026714158504006, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.883010864257812, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.873173713684082, + "num_tokens": 600585818.0, + "step": 15743 + }, + { + "epoch": 2.002798626128991, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03725242614746, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8764197826385498, + "num_tokens": 600623106.0, + "step": 15744 + }, + { + "epoch": 2.0029258364075817, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03121566772461, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8924753665924072, + "num_tokens": 600662662.0, + "step": 15745 + }, + { + "epoch": 2.0030530466861722, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.825271606445312, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8579984903335571, + "num_tokens": 600699416.0, + "step": 15746 + }, + { + "epoch": 2.0031802569647628, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96160888671875, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8645851612091064, + "num_tokens": 600739351.0, + "step": 15747 + }, + { + "epoch": 2.0033074672433533, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.077011108398438, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8738536238670349, + "num_tokens": 600781198.0, + "step": 15748 + }, + { + "epoch": 2.003434677521944, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.988937377929688, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8705614805221558, + "num_tokens": 600821743.0, + "step": 15749 + }, + { + "epoch": 2.0035618878005343, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.873733520507812, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8867735862731934, + "num_tokens": 600862463.0, + "step": 15750 + }, + { + "epoch": 2.003689098079125, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80024528503418, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8616036176681519, + "num_tokens": 600902629.0, + "step": 15751 + }, + { + "epoch": 2.0038163083577154, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.904977798461914, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8694894313812256, + "num_tokens": 600941778.0, + "step": 15752 + }, + { + "epoch": 2.003943518636306, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.957965850830078, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8797873258590698, + "num_tokens": 600979425.0, + "step": 15753 + }, + { + "epoch": 2.0040707289148965, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.980331420898438, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8677527904510498, + "num_tokens": 601017882.0, + "step": 15754 + }, + { + "epoch": 2.004197939193487, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98366928100586, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8662642240524292, + "num_tokens": 601053394.0, + "step": 15755 + }, + { + "epoch": 2.0043251494720775, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.949867248535156, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8755015730857849, + "num_tokens": 601094940.0, + "step": 15756 + }, + { + "epoch": 2.004452359750668, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94607162475586, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8727638721466064, + "num_tokens": 601132963.0, + "step": 15757 + }, + { + "epoch": 2.0045795700292586, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.973217010498047, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.871100127696991, + "num_tokens": 601168511.0, + "step": 15758 + }, + { + "epoch": 2.0047067803078487, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88013458251953, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8719750642776489, + "num_tokens": 601208922.0, + "step": 15759 + }, + { + "epoch": 2.004833990586439, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02123260498047, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.866175651550293, + "num_tokens": 601253814.0, + "step": 15760 + }, + { + "epoch": 2.0049612008650297, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03842544555664, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8732698559761047, + "num_tokens": 601295584.0, + "step": 15761 + }, + { + "epoch": 2.0050884111436202, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.915895462036133, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8635653257369995, + "num_tokens": 601329134.0, + "step": 15762 + }, + { + "epoch": 2.0052156214222108, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.886825561523438, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8807875514030457, + "num_tokens": 601369100.0, + "step": 15763 + }, + { + "epoch": 2.0053428317008013, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.093826293945312, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8867543935775757, + "num_tokens": 601407876.0, + "step": 15764 + }, + { + "epoch": 2.005470041979392, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78852653503418, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8853546977043152, + "num_tokens": 601446996.0, + "step": 15765 + }, + { + "epoch": 2.0055972522579824, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.728118896484375, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.863764762878418, + "num_tokens": 601482158.0, + "step": 15766 + }, + { + "epoch": 2.005724462536573, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.986492156982422, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8777135014533997, + "num_tokens": 601517940.0, + "step": 15767 + }, + { + "epoch": 2.0058516728151634, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.923986434936523, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8921040892601013, + "num_tokens": 601553823.0, + "step": 15768 + }, + { + "epoch": 2.005978883093754, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.778120040893555, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8602052927017212, + "num_tokens": 601595482.0, + "step": 15769 + }, + { + "epoch": 2.0061060933723445, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87440299987793, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8796971440315247, + "num_tokens": 601632451.0, + "step": 15770 + }, + { + "epoch": 2.006233303650935, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.732421875, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8698278069496155, + "num_tokens": 601667573.0, + "step": 15771 + }, + { + "epoch": 2.0063605139295255, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.017118453979492, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8783320188522339, + "num_tokens": 601704494.0, + "step": 15772 + }, + { + "epoch": 2.006487724208116, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94758415222168, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8619019389152527, + "num_tokens": 601745307.0, + "step": 15773 + }, + { + "epoch": 2.0066149344867066, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.832622528076172, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8885549902915955, + "num_tokens": 601782551.0, + "step": 15774 + }, + { + "epoch": 2.006742144765297, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10769271850586, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8740178942680359, + "num_tokens": 601823378.0, + "step": 15775 + }, + { + "epoch": 2.0068693550438876, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.74919319152832, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8726563453674316, + "num_tokens": 601860881.0, + "step": 15776 + }, + { + "epoch": 2.006996565322478, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.041481018066406, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8707177042961121, + "num_tokens": 601903502.0, + "step": 15777 + }, + { + "epoch": 2.0071237756010687, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.036645889282227, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8900696635246277, + "num_tokens": 601937097.0, + "step": 15778 + }, + { + "epoch": 2.007250985879659, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0200252532959, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.875823438167572, + "num_tokens": 601978050.0, + "step": 15779 + }, + { + "epoch": 2.0073781961582498, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.964815139770508, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8699966669082642, + "num_tokens": 602018167.0, + "step": 15780 + }, + { + "epoch": 2.0075054064368403, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.890901565551758, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.858220100402832, + "num_tokens": 602054485.0, + "step": 15781 + }, + { + "epoch": 2.007632616715431, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96492576599121, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8769766092300415, + "num_tokens": 602092881.0, + "step": 15782 + }, + { + "epoch": 2.0077598269940213, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0185604095459, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8732175230979919, + "num_tokens": 602127424.0, + "step": 15783 + }, + { + "epoch": 2.0078870372726114, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.150333404541016, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8798709511756897, + "num_tokens": 602168340.0, + "step": 15784 + }, + { + "epoch": 2.008014247551202, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.823923110961914, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8636722564697266, + "num_tokens": 602207894.0, + "step": 15785 + }, + { + "epoch": 2.0081414578297925, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.947444915771484, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8746756911277771, + "num_tokens": 602245924.0, + "step": 15786 + }, + { + "epoch": 2.008268668108383, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.859092712402344, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8710057735443115, + "num_tokens": 602278417.0, + "step": 15787 + }, + { + "epoch": 2.0083958783869735, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.041973114013672, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8753007650375366, + "num_tokens": 602318431.0, + "step": 15788 + }, + { + "epoch": 2.008523088665564, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98484230041504, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8640084862709045, + "num_tokens": 602356315.0, + "step": 15789 + }, + { + "epoch": 2.0086502989441546, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.877965927124023, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8863731622695923, + "num_tokens": 602398865.0, + "step": 15790 + }, + { + "epoch": 2.008777509222745, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.924335479736328, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8828166723251343, + "num_tokens": 602435835.0, + "step": 15791 + }, + { + "epoch": 2.0089047195013356, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.924699783325195, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8803122043609619, + "num_tokens": 602470359.0, + "step": 15792 + }, + { + "epoch": 2.009031929779926, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90168571472168, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.876919150352478, + "num_tokens": 602512132.0, + "step": 15793 + }, + { + "epoch": 2.0091591400585167, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.934127807617188, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8695729970932007, + "num_tokens": 602552700.0, + "step": 15794 + }, + { + "epoch": 2.0092863503371072, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.854177474975586, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8723195791244507, + "num_tokens": 602587779.0, + "step": 15795 + }, + { + "epoch": 2.0094135606156978, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96126365661621, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8617774248123169, + "num_tokens": 602623834.0, + "step": 15796 + }, + { + "epoch": 2.0095407708942883, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.794761657714844, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8832461833953857, + "num_tokens": 602660141.0, + "step": 15797 + }, + { + "epoch": 2.009667981172879, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.925600051879883, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8771055936813354, + "num_tokens": 602698990.0, + "step": 15798 + }, + { + "epoch": 2.0097951914514693, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84813117980957, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8549506664276123, + "num_tokens": 602732053.0, + "step": 15799 + }, + { + "epoch": 2.00992240173006, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.944049835205078, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8739449977874756, + "num_tokens": 602770975.0, + "step": 15800 + }, + { + "epoch": 2.0100496120086504, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02517318725586, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8634078502655029, + "num_tokens": 602813207.0, + "step": 15801 + }, + { + "epoch": 2.010176822287241, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.979877471923828, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8790261745452881, + "num_tokens": 602849951.0, + "step": 15802 + }, + { + "epoch": 2.0103040325658315, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.956918716430664, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8639466166496277, + "num_tokens": 602891418.0, + "step": 15803 + }, + { + "epoch": 2.010431242844422, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86189842224121, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8785666823387146, + "num_tokens": 602933039.0, + "step": 15804 + }, + { + "epoch": 2.0105584531230125, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86053466796875, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8876923322677612, + "num_tokens": 602965546.0, + "step": 15805 + }, + { + "epoch": 2.010685663401603, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.880788803100586, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8725245594978333, + "num_tokens": 603004473.0, + "step": 15806 + }, + { + "epoch": 2.0108128736801936, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.754531860351562, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8656734228134155, + "num_tokens": 603041644.0, + "step": 15807 + }, + { + "epoch": 2.0109400839587837, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.968690872192383, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8610900640487671, + "num_tokens": 603084698.0, + "step": 15808 + }, + { + "epoch": 2.011067294237374, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.831031799316406, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8571182489395142, + "num_tokens": 603121059.0, + "step": 15809 + }, + { + "epoch": 2.0111945045159647, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93555450439453, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.870362401008606, + "num_tokens": 603156604.0, + "step": 15810 + }, + { + "epoch": 2.0113217147945552, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.002971649169922, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8726545572280884, + "num_tokens": 603183743.0, + "step": 15811 + }, + { + "epoch": 2.0114489250731458, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.135658264160156, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8486770987510681, + "num_tokens": 603222099.0, + "step": 15812 + }, + { + "epoch": 2.0115761353517363, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83582878112793, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8531420826911926, + "num_tokens": 603263467.0, + "step": 15813 + }, + { + "epoch": 2.011703345630327, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.97222137451172, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8532353639602661, + "num_tokens": 603296726.0, + "step": 15814 + }, + { + "epoch": 2.0118305559089174, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.064971923828125, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8781751990318298, + "num_tokens": 603329691.0, + "step": 15815 + }, + { + "epoch": 2.011957766187508, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91486930847168, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8800789713859558, + "num_tokens": 603371619.0, + "step": 15816 + }, + { + "epoch": 2.0120849764660984, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.011560440063477, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.874856173992157, + "num_tokens": 603400018.0, + "step": 15817 + }, + { + "epoch": 2.012212186744689, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79583740234375, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8755139112472534, + "num_tokens": 603436971.0, + "step": 15818 + }, + { + "epoch": 2.0123393970232795, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.022449493408203, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8724952936172485, + "num_tokens": 603472802.0, + "step": 15819 + }, + { + "epoch": 2.01246660730187, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.884153366088867, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8778781890869141, + "num_tokens": 603506547.0, + "step": 15820 + }, + { + "epoch": 2.0125938175804605, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.976104736328125, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8550947308540344, + "num_tokens": 603544967.0, + "step": 15821 + }, + { + "epoch": 2.012721027859051, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80374526977539, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8673019409179688, + "num_tokens": 603582522.0, + "step": 15822 + }, + { + "epoch": 2.0128482381376416, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.81490707397461, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.881953775882721, + "num_tokens": 603619584.0, + "step": 15823 + }, + { + "epoch": 2.012975448416232, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08188247680664, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8602475523948669, + "num_tokens": 603660542.0, + "step": 15824 + }, + { + "epoch": 2.0131026586948226, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.805408477783203, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8742826581001282, + "num_tokens": 603698900.0, + "step": 15825 + }, + { + "epoch": 2.013229868973413, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8914852142334, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.853983998298645, + "num_tokens": 603733019.0, + "step": 15826 + }, + { + "epoch": 2.0133570792520037, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.81351661682129, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8803029656410217, + "num_tokens": 603772071.0, + "step": 15827 + }, + { + "epoch": 2.013484289530594, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.902055740356445, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8649939894676208, + "num_tokens": 603811008.0, + "step": 15828 + }, + { + "epoch": 2.0136114998091847, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.839977264404297, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8627117872238159, + "num_tokens": 603852647.0, + "step": 15829 + }, + { + "epoch": 2.0137387100877753, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.999103546142578, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8709825277328491, + "num_tokens": 603894198.0, + "step": 15830 + }, + { + "epoch": 2.013865920366366, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.873422622680664, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8691527843475342, + "num_tokens": 603930168.0, + "step": 15831 + }, + { + "epoch": 2.0139931306449563, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.941694259643555, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8813799619674683, + "num_tokens": 603966189.0, + "step": 15832 + }, + { + "epoch": 2.0141203409235464, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76609992980957, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8680391311645508, + "num_tokens": 604011110.0, + "step": 15833 + }, + { + "epoch": 2.014247551202137, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.900142669677734, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8555012345314026, + "num_tokens": 604044256.0, + "step": 15834 + }, + { + "epoch": 2.0143747614807275, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848407745361328, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8808584213256836, + "num_tokens": 604078652.0, + "step": 15835 + }, + { + "epoch": 2.014501971759318, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.95747184753418, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8812992572784424, + "num_tokens": 604112302.0, + "step": 15836 + }, + { + "epoch": 2.0146291820379085, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.972888946533203, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8880236148834229, + "num_tokens": 604150011.0, + "step": 15837 + }, + { + "epoch": 2.014756392316499, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.056781768798828, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8726463913917542, + "num_tokens": 604188859.0, + "step": 15838 + }, + { + "epoch": 2.0148836025950896, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.729888916015625, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8665163516998291, + "num_tokens": 604232273.0, + "step": 15839 + }, + { + "epoch": 2.01501081287368, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.944774627685547, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8786057829856873, + "num_tokens": 604265236.0, + "step": 15840 + }, + { + "epoch": 2.0151380231522706, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.87135124206543, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8861974477767944, + "num_tokens": 604304784.0, + "step": 15841 + }, + { + "epoch": 2.015265233430861, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.903305053710938, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8969926238059998, + "num_tokens": 604341461.0, + "step": 15842 + }, + { + "epoch": 2.0153924437094517, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.738527297973633, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8781962990760803, + "num_tokens": 604378628.0, + "step": 15843 + }, + { + "epoch": 2.0155196539880422, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.822063446044922, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.882644534111023, + "num_tokens": 604418070.0, + "step": 15844 + }, + { + "epoch": 2.0156468642666328, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.936134338378906, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8855661153793335, + "num_tokens": 604454445.0, + "step": 15845 + }, + { + "epoch": 2.0157740745452233, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.798559188842773, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8757913112640381, + "num_tokens": 604488559.0, + "step": 15846 + }, + { + "epoch": 2.015901284823814, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.134138107299805, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.865551769733429, + "num_tokens": 604522538.0, + "step": 15847 + }, + { + "epoch": 2.0160284951024043, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86184310913086, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8916599750518799, + "num_tokens": 604561750.0, + "step": 15848 + }, + { + "epoch": 2.016155705380995, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.76329231262207, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8857473134994507, + "num_tokens": 604594961.0, + "step": 15849 + }, + { + "epoch": 2.0162829156595854, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.103046417236328, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8639722466468811, + "num_tokens": 604631301.0, + "step": 15850 + }, + { + "epoch": 2.016410125938176, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.948394775390625, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8528062105178833, + "num_tokens": 604666049.0, + "step": 15851 + }, + { + "epoch": 2.0165373362167665, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.003238677978516, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.893762469291687, + "num_tokens": 604701984.0, + "step": 15852 + }, + { + "epoch": 2.016664546495357, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.959697723388672, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8810202479362488, + "num_tokens": 604738414.0, + "step": 15853 + }, + { + "epoch": 2.0167917567739475, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96778106689453, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8749918937683105, + "num_tokens": 604779961.0, + "step": 15854 + }, + { + "epoch": 2.016918967052538, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.929590225219727, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8658915758132935, + "num_tokens": 604819729.0, + "step": 15855 + }, + { + "epoch": 2.0170461773311286, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89381980895996, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8825387954711914, + "num_tokens": 604864108.0, + "step": 15856 + }, + { + "epoch": 2.0171733876097186, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.793596267700195, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8667199611663818, + "num_tokens": 604905582.0, + "step": 15857 + }, + { + "epoch": 2.017300597888309, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.890756607055664, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8803682923316956, + "num_tokens": 604943931.0, + "step": 15858 + }, + { + "epoch": 2.0174278081668997, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.69106101989746, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8690146803855896, + "num_tokens": 604982100.0, + "step": 15859 + }, + { + "epoch": 2.0175550184454902, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.75171661376953, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8622112274169922, + "num_tokens": 605026289.0, + "step": 15860 + }, + { + "epoch": 2.0176822287240808, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.032730102539062, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8767862915992737, + "num_tokens": 605070214.0, + "step": 15861 + }, + { + "epoch": 2.0178094390026713, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.82141876220703, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8683532476425171, + "num_tokens": 605113495.0, + "step": 15862 + }, + { + "epoch": 2.017936649281262, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.812484741210938, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8764417171478271, + "num_tokens": 605145717.0, + "step": 15863 + }, + { + "epoch": 2.0180638595598523, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.884963989257812, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8664674758911133, + "num_tokens": 605180930.0, + "step": 15864 + }, + { + "epoch": 2.018191069838443, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90705680847168, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8795362710952759, + "num_tokens": 605214004.0, + "step": 15865 + }, + { + "epoch": 2.0183182801170334, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.11347007751465, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.87293541431427, + "num_tokens": 605253244.0, + "step": 15866 + }, + { + "epoch": 2.018445490395624, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86414337158203, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8691797852516174, + "num_tokens": 605293832.0, + "step": 15867 + }, + { + "epoch": 2.0185727006742145, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.84675407409668, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8830210566520691, + "num_tokens": 605333811.0, + "step": 15868 + }, + { + "epoch": 2.018699910952805, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.106006622314453, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8724874258041382, + "num_tokens": 605371560.0, + "step": 15869 + }, + { + "epoch": 2.0188271212313955, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.791654586791992, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.869596004486084, + "num_tokens": 605410555.0, + "step": 15870 + }, + { + "epoch": 2.018954331509986, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.929744720458984, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.866531252861023, + "num_tokens": 605451851.0, + "step": 15871 + }, + { + "epoch": 2.0190815417885766, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79276466369629, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8646374940872192, + "num_tokens": 605490442.0, + "step": 15872 + }, + { + "epoch": 2.019208752067167, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.927061080932617, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8713933229446411, + "num_tokens": 605525237.0, + "step": 15873 + }, + { + "epoch": 2.0193359623457576, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.935781478881836, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8777645826339722, + "num_tokens": 605563624.0, + "step": 15874 + }, + { + "epoch": 2.019463172624348, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.971546173095703, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8819754123687744, + "num_tokens": 605597419.0, + "step": 15875 + }, + { + "epoch": 2.0195903829029387, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98844337463379, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8659173250198364, + "num_tokens": 605639954.0, + "step": 15876 + }, + { + "epoch": 2.019717593181529, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.926698684692383, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8728117346763611, + "num_tokens": 605678226.0, + "step": 15877 + }, + { + "epoch": 2.0198448034601197, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848615646362305, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8804742097854614, + "num_tokens": 605719925.0, + "step": 15878 + }, + { + "epoch": 2.0199720137387103, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.992692947387695, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8539143800735474, + "num_tokens": 605761282.0, + "step": 15879 + }, + { + "epoch": 2.020099224017301, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98975944519043, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8817471861839294, + "num_tokens": 605800681.0, + "step": 15880 + }, + { + "epoch": 2.0202264342958913, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8677978515625, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8559251427650452, + "num_tokens": 605836282.0, + "step": 15881 + }, + { + "epoch": 2.0203536445744814, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96011734008789, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8755470514297485, + "num_tokens": 605880360.0, + "step": 15882 + }, + { + "epoch": 2.020480854853072, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83051872253418, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8827990293502808, + "num_tokens": 605918363.0, + "step": 15883 + }, + { + "epoch": 2.0206080651316625, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.830068588256836, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8871893882751465, + "num_tokens": 605953032.0, + "step": 15884 + }, + { + "epoch": 2.020735275410253, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78603744506836, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8803740739822388, + "num_tokens": 605992633.0, + "step": 15885 + }, + { + "epoch": 2.0208624856888435, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.909711837768555, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8754569292068481, + "num_tokens": 606031776.0, + "step": 15886 + }, + { + "epoch": 2.020989695967434, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05089569091797, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8662383556365967, + "num_tokens": 606070614.0, + "step": 15887 + }, + { + "epoch": 2.0211169062460246, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.97393226623535, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8721659183502197, + "num_tokens": 606115898.0, + "step": 15888 + }, + { + "epoch": 2.021244116524615, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.931171417236328, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8771527409553528, + "num_tokens": 606150231.0, + "step": 15889 + }, + { + "epoch": 2.0213713268032056, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.944866180419922, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8643743395805359, + "num_tokens": 606191527.0, + "step": 15890 + }, + { + "epoch": 2.021498537081796, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.976030349731445, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8656998872756958, + "num_tokens": 606229070.0, + "step": 15891 + }, + { + "epoch": 2.0216257473603867, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.929346084594727, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8746570944786072, + "num_tokens": 606269555.0, + "step": 15892 + }, + { + "epoch": 2.021752957638977, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.066518783569336, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.868587851524353, + "num_tokens": 606302040.0, + "step": 15893 + }, + { + "epoch": 2.0218801679175677, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.918821334838867, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8824211359024048, + "num_tokens": 606341769.0, + "step": 15894 + }, + { + "epoch": 2.0220073781961583, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.936429977416992, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8598295450210571, + "num_tokens": 606379989.0, + "step": 15895 + }, + { + "epoch": 2.022134588474749, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.920536041259766, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8670669794082642, + "num_tokens": 606414396.0, + "step": 15896 + }, + { + "epoch": 2.0222617987533393, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89020538330078, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8762686252593994, + "num_tokens": 606445949.0, + "step": 15897 + }, + { + "epoch": 2.02238900903193, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.79055404663086, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8756433725357056, + "num_tokens": 606484751.0, + "step": 15898 + }, + { + "epoch": 2.0225162193105204, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91033935546875, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8762851357460022, + "num_tokens": 606525390.0, + "step": 15899 + }, + { + "epoch": 2.022643429589111, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.938873291015625, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8695101737976074, + "num_tokens": 606563364.0, + "step": 15900 + }, + { + "epoch": 2.0227706398677014, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94179344177246, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8757901191711426, + "num_tokens": 606598140.0, + "step": 15901 + }, + { + "epoch": 2.022897850146292, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.099653244018555, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8663490414619446, + "num_tokens": 606636480.0, + "step": 15902 + }, + { + "epoch": 2.0230250604248825, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06589126586914, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8769986629486084, + "num_tokens": 606676044.0, + "step": 15903 + }, + { + "epoch": 2.023152270703473, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.95764923095703, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8654149770736694, + "num_tokens": 606713262.0, + "step": 15904 + }, + { + "epoch": 2.0232794809820636, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.084810256958008, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8692247867584229, + "num_tokens": 606754844.0, + "step": 15905 + }, + { + "epoch": 2.0234066912606536, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.946903228759766, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8733102083206177, + "num_tokens": 606800910.0, + "step": 15906 + }, + { + "epoch": 2.023533901539244, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.174470901489258, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8676947951316833, + "num_tokens": 606841855.0, + "step": 15907 + }, + { + "epoch": 2.0236611118178347, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.115245819091797, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8625075817108154, + "num_tokens": 606888261.0, + "step": 15908 + }, + { + "epoch": 2.0237883220964252, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.131120681762695, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8644360899925232, + "num_tokens": 606926235.0, + "step": 15909 + }, + { + "epoch": 2.0239155323750158, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.204084396362305, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8684273362159729, + "num_tokens": 606962527.0, + "step": 15910 + }, + { + "epoch": 2.0240427426536063, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42580223083496, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8774480819702148, + "num_tokens": 607008924.0, + "step": 15911 + }, + { + "epoch": 2.024169952932197, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.077028274536133, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.868881344795227, + "num_tokens": 607049788.0, + "step": 15912 + }, + { + "epoch": 2.0242971632107873, + "ewc_loss": 0.03564453125, + "ewc_loss_parallel": 3.5762786865234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.998004913330078, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8660892248153687, + "num_tokens": 607087592.0, + "step": 15913 + }, + { + "epoch": 2.024424373489378, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5903377532959, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8875848650932312, + "num_tokens": 607119954.0, + "step": 15914 + }, + { + "epoch": 2.0245515837679684, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.59850311279297, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8627864122390747, + "num_tokens": 607156511.0, + "step": 15915 + }, + { + "epoch": 2.024678794046559, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.918031692504883, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8814176321029663, + "num_tokens": 607198292.0, + "step": 15916 + }, + { + "epoch": 2.0248060043251495, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15852928161621, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8706828355789185, + "num_tokens": 607237572.0, + "step": 15917 + }, + { + "epoch": 2.02493321460374, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.152774810791016, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8630883693695068, + "num_tokens": 607282321.0, + "step": 15918 + }, + { + "epoch": 2.0250604248823305, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.841646194458008, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.871387243270874, + "num_tokens": 607313355.0, + "step": 15919 + }, + { + "epoch": 2.025187635160921, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15482521057129, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8703856468200684, + "num_tokens": 607351961.0, + "step": 15920 + }, + { + "epoch": 2.0253148454395116, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21050453186035, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8750867247581482, + "num_tokens": 607391368.0, + "step": 15921 + }, + { + "epoch": 2.025442055718102, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.775068283081055, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.891202449798584, + "num_tokens": 607427328.0, + "step": 15922 + }, + { + "epoch": 2.0255692659966926, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10789680480957, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8690401315689087, + "num_tokens": 607468243.0, + "step": 15923 + }, + { + "epoch": 2.025696476275283, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.017894744873047, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.856133222579956, + "num_tokens": 607502466.0, + "step": 15924 + }, + { + "epoch": 2.0258236865538737, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.810609817504883, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8636571764945984, + "num_tokens": 607544093.0, + "step": 15925 + }, + { + "epoch": 2.025950896832464, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.995737075805664, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8785111308097839, + "num_tokens": 607576054.0, + "step": 15926 + }, + { + "epoch": 2.0260781071110547, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91741943359375, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8852138519287109, + "num_tokens": 607608462.0, + "step": 15927 + }, + { + "epoch": 2.0262053173896453, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03915786743164, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8669923543930054, + "num_tokens": 607645272.0, + "step": 15928 + }, + { + "epoch": 2.026332527668236, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.942394256591797, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8743404150009155, + "num_tokens": 607682585.0, + "step": 15929 + }, + { + "epoch": 2.0264597379468263, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.157621383666992, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8900785446166992, + "num_tokens": 607721302.0, + "step": 15930 + }, + { + "epoch": 2.0265869482254164, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07558250427246, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8681502342224121, + "num_tokens": 607758357.0, + "step": 15931 + }, + { + "epoch": 2.026714158504007, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.967077255249023, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8719468712806702, + "num_tokens": 607795467.0, + "step": 15932 + }, + { + "epoch": 2.0268413687825975, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.178489685058594, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8673628568649292, + "num_tokens": 607833192.0, + "step": 15933 + }, + { + "epoch": 2.026968579061188, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06499671936035, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8809770345687866, + "num_tokens": 607866672.0, + "step": 15934 + }, + { + "epoch": 2.0270957893397785, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.926790237426758, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8737623691558838, + "num_tokens": 607907976.0, + "step": 15935 + }, + { + "epoch": 2.027222999618369, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45438575744629, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.876471996307373, + "num_tokens": 607946009.0, + "step": 15936 + }, + { + "epoch": 2.0273502098969596, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.856292724609375, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8773058652877808, + "num_tokens": 607986800.0, + "step": 15937 + }, + { + "epoch": 2.02747742017555, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.047584533691406, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8669939637184143, + "num_tokens": 608021807.0, + "step": 15938 + }, + { + "epoch": 2.0276046304541406, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.001680374145508, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8713400959968567, + "num_tokens": 608060294.0, + "step": 15939 + }, + { + "epoch": 2.027731840732731, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.965970993041992, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8717848062515259, + "num_tokens": 608100790.0, + "step": 15940 + }, + { + "epoch": 2.0278590510113217, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.979949951171875, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8687150478363037, + "num_tokens": 608142164.0, + "step": 15941 + }, + { + "epoch": 2.027986261289912, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.120134353637695, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8729935884475708, + "num_tokens": 608184175.0, + "step": 15942 + }, + { + "epoch": 2.0281134715685027, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88457679748535, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8667166233062744, + "num_tokens": 608226801.0, + "step": 15943 + }, + { + "epoch": 2.0282406818470933, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.963151931762695, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.880728006362915, + "num_tokens": 608265155.0, + "step": 15944 + }, + { + "epoch": 2.028367892125684, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.975780487060547, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8829529881477356, + "num_tokens": 608303817.0, + "step": 15945 + }, + { + "epoch": 2.0284951024042743, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31077003479004, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8788961172103882, + "num_tokens": 608338845.0, + "step": 15946 + }, + { + "epoch": 2.028622312682865, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18292236328125, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8723279237747192, + "num_tokens": 608377661.0, + "step": 15947 + }, + { + "epoch": 2.0287495229614554, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10841178894043, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8723849654197693, + "num_tokens": 608412897.0, + "step": 15948 + }, + { + "epoch": 2.028876733240046, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.261499404907227, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8645456433296204, + "num_tokens": 608453584.0, + "step": 15949 + }, + { + "epoch": 2.0290039435186364, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0430908203125, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8682119846343994, + "num_tokens": 608492211.0, + "step": 15950 + }, + { + "epoch": 2.029131153797227, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.310022354125977, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8685516119003296, + "num_tokens": 608534317.0, + "step": 15951 + }, + { + "epoch": 2.0292583640758175, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.081192016601562, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8821637034416199, + "num_tokens": 608569763.0, + "step": 15952 + }, + { + "epoch": 2.029385574354408, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.994110107421875, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8758649826049805, + "num_tokens": 608612145.0, + "step": 15953 + }, + { + "epoch": 2.0295127846329986, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.066720962524414, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8726425766944885, + "num_tokens": 608649528.0, + "step": 15954 + }, + { + "epoch": 2.0296399949115886, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96013832092285, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8670405745506287, + "num_tokens": 608686973.0, + "step": 15955 + }, + { + "epoch": 2.029767205190179, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9777889251709, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8748643398284912, + "num_tokens": 608729169.0, + "step": 15956 + }, + { + "epoch": 2.0298944154687697, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.096494674682617, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8723127245903015, + "num_tokens": 608764502.0, + "step": 15957 + }, + { + "epoch": 2.0300216257473602, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.175094604492188, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8735049962997437, + "num_tokens": 608805190.0, + "step": 15958 + }, + { + "epoch": 2.0301488360259508, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.965028762817383, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8793148994445801, + "num_tokens": 608840006.0, + "step": 15959 + }, + { + "epoch": 2.0302760463045413, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.087202072143555, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8791638016700745, + "num_tokens": 608871551.0, + "step": 15960 + }, + { + "epoch": 2.030403256583132, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92702865600586, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8513902425765991, + "num_tokens": 608909610.0, + "step": 15961 + }, + { + "epoch": 2.0305304668617223, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00053596496582, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8700408339500427, + "num_tokens": 608950758.0, + "step": 15962 + }, + { + "epoch": 2.030657677140313, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.869081497192383, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8608540296554565, + "num_tokens": 608990695.0, + "step": 15963 + }, + { + "epoch": 2.0307848874189034, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00838279724121, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8862345218658447, + "num_tokens": 609030978.0, + "step": 15964 + }, + { + "epoch": 2.030912097697494, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.709575653076172, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8821092844009399, + "num_tokens": 609071974.0, + "step": 15965 + }, + { + "epoch": 2.0310393079760845, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9802188873291, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8743741512298584, + "num_tokens": 609111268.0, + "step": 15966 + }, + { + "epoch": 2.031166518254675, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21146583557129, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8685879707336426, + "num_tokens": 609149681.0, + "step": 15967 + }, + { + "epoch": 2.0312937285332655, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.881343841552734, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8774774074554443, + "num_tokens": 609189121.0, + "step": 15968 + }, + { + "epoch": 2.031420938811856, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.999814987182617, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.881577730178833, + "num_tokens": 609228070.0, + "step": 15969 + }, + { + "epoch": 2.0315481490904466, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.877695083618164, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8749465346336365, + "num_tokens": 609269718.0, + "step": 15970 + }, + { + "epoch": 2.031675359369037, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16794776916504, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8754715919494629, + "num_tokens": 609307236.0, + "step": 15971 + }, + { + "epoch": 2.0318025696476276, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.060302734375, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.85526043176651, + "num_tokens": 609342993.0, + "step": 15972 + }, + { + "epoch": 2.031929779926218, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.950519561767578, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8798698782920837, + "num_tokens": 609376817.0, + "step": 15973 + }, + { + "epoch": 2.0320569902048087, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.955373764038086, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8719828128814697, + "num_tokens": 609411209.0, + "step": 15974 + }, + { + "epoch": 2.032184200483399, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.056867599487305, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8904497027397156, + "num_tokens": 609449055.0, + "step": 15975 + }, + { + "epoch": 2.0323114107619897, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92105484008789, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8741565942764282, + "num_tokens": 609486332.0, + "step": 15976 + }, + { + "epoch": 2.0324386210405803, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.172237396240234, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8605290651321411, + "num_tokens": 609526924.0, + "step": 15977 + }, + { + "epoch": 2.032565831319171, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.885595321655273, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8869558572769165, + "num_tokens": 609559542.0, + "step": 15978 + }, + { + "epoch": 2.032693041597761, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.984949111938477, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8653899431228638, + "num_tokens": 609598989.0, + "step": 15979 + }, + { + "epoch": 2.0328202518763514, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.008846282958984, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8607059717178345, + "num_tokens": 609635790.0, + "step": 15980 + }, + { + "epoch": 2.032947462154942, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15772247314453, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8652732968330383, + "num_tokens": 609670120.0, + "step": 15981 + }, + { + "epoch": 2.0330746724335325, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.891897201538086, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8731387257575989, + "num_tokens": 609713260.0, + "step": 15982 + }, + { + "epoch": 2.033201882712123, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.021007537841797, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8786587715148926, + "num_tokens": 609753236.0, + "step": 15983 + }, + { + "epoch": 2.0333290929907135, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.098905563354492, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8760085105895996, + "num_tokens": 609791629.0, + "step": 15984 + }, + { + "epoch": 2.033456303269304, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.145000457763672, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8574787974357605, + "num_tokens": 609833457.0, + "step": 15985 + }, + { + "epoch": 2.0335835135478946, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.011524200439453, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8656646013259888, + "num_tokens": 609866007.0, + "step": 15986 + }, + { + "epoch": 2.033710723826485, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.116962432861328, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8772501945495605, + "num_tokens": 609903346.0, + "step": 15987 + }, + { + "epoch": 2.0338379341050756, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.12653350830078, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8771982789039612, + "num_tokens": 609935885.0, + "step": 15988 + }, + { + "epoch": 2.033965144383666, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.170747756958008, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8662207126617432, + "num_tokens": 609971127.0, + "step": 15989 + }, + { + "epoch": 2.0340923546622567, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07556915283203, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8634225726127625, + "num_tokens": 610016719.0, + "step": 15990 + }, + { + "epoch": 2.034219564940847, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.130489349365234, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8620636463165283, + "num_tokens": 610057002.0, + "step": 15991 + }, + { + "epoch": 2.0343467752194377, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16887092590332, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8777270317077637, + "num_tokens": 610092779.0, + "step": 15992 + }, + { + "epoch": 2.0344739854980283, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.010883331298828, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8632028698921204, + "num_tokens": 610137092.0, + "step": 15993 + }, + { + "epoch": 2.034601195776619, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.281352996826172, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8820066452026367, + "num_tokens": 610178632.0, + "step": 15994 + }, + { + "epoch": 2.0347284060552093, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07598876953125, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8719170689582825, + "num_tokens": 610219789.0, + "step": 15995 + }, + { + "epoch": 2.0348556163338, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.049667358398438, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.85863196849823, + "num_tokens": 610259104.0, + "step": 15996 + }, + { + "epoch": 2.0349828266123904, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.025785446166992, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8667314052581787, + "num_tokens": 610296317.0, + "step": 15997 + }, + { + "epoch": 2.035110036890981, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.169645309448242, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8697237968444824, + "num_tokens": 610339300.0, + "step": 15998 + }, + { + "epoch": 2.0352372471695714, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9194278717041, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8689616918563843, + "num_tokens": 610373917.0, + "step": 15999 + }, + { + "epoch": 2.035364457448162, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.046451568603516, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8712435960769653, + "num_tokens": 610419571.0, + "step": 16000 + }, + { + "epoch": 2.0354916677267525, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1061954498291, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8640431761741638, + "num_tokens": 610458073.0, + "step": 16001 + }, + { + "epoch": 2.035618878005343, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0648136138916, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.862464964389801, + "num_tokens": 610497223.0, + "step": 16002 + }, + { + "epoch": 2.0357460882839336, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03876304626465, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8523601293563843, + "num_tokens": 610532563.0, + "step": 16003 + }, + { + "epoch": 2.0358732985625236, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.005496978759766, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8831875920295715, + "num_tokens": 610566644.0, + "step": 16004 + }, + { + "epoch": 2.036000508841114, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.971881866455078, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8604257106781006, + "num_tokens": 610604755.0, + "step": 16005 + }, + { + "epoch": 2.0361277191197047, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.158798217773438, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8759555220603943, + "num_tokens": 610645905.0, + "step": 16006 + }, + { + "epoch": 2.036254929398295, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.894453048706055, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8890049457550049, + "num_tokens": 610686800.0, + "step": 16007 + }, + { + "epoch": 2.0363821396768857, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.01852798461914, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8724421858787537, + "num_tokens": 610728291.0, + "step": 16008 + }, + { + "epoch": 2.0365093499554763, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08867645263672, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8702234029769897, + "num_tokens": 610759818.0, + "step": 16009 + }, + { + "epoch": 2.036636560234067, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.044532775878906, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8580350875854492, + "num_tokens": 610800125.0, + "step": 16010 + }, + { + "epoch": 2.0367637705126573, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.886323928833008, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8636153936386108, + "num_tokens": 610839261.0, + "step": 16011 + }, + { + "epoch": 2.036890980791248, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09602928161621, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8764792680740356, + "num_tokens": 610879092.0, + "step": 16012 + }, + { + "epoch": 2.0370181910698384, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.914155960083008, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8720473051071167, + "num_tokens": 610914494.0, + "step": 16013 + }, + { + "epoch": 2.037145401348429, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.009063720703125, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.882996141910553, + "num_tokens": 610950784.0, + "step": 16014 + }, + { + "epoch": 2.0372726116270194, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.194292068481445, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8613319396972656, + "num_tokens": 610989254.0, + "step": 16015 + }, + { + "epoch": 2.03739982190561, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.977710723876953, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8651695251464844, + "num_tokens": 611022668.0, + "step": 16016 + }, + { + "epoch": 2.0375270321842005, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.04347801208496, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8714330196380615, + "num_tokens": 611062110.0, + "step": 16017 + }, + { + "epoch": 2.037654242462791, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.327821731567383, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8762632608413696, + "num_tokens": 611098663.0, + "step": 16018 + }, + { + "epoch": 2.0377814527413816, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.071426391601562, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8691732883453369, + "num_tokens": 611134058.0, + "step": 16019 + }, + { + "epoch": 2.037908663019972, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93563461303711, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.872593343257904, + "num_tokens": 611174748.0, + "step": 16020 + }, + { + "epoch": 2.0380358732985626, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08717918395996, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.888609766960144, + "num_tokens": 611216792.0, + "step": 16021 + }, + { + "epoch": 2.038163083577153, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.861970901489258, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8818857073783875, + "num_tokens": 611253996.0, + "step": 16022 + }, + { + "epoch": 2.0382902938557437, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.183574676513672, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8825653791427612, + "num_tokens": 611295351.0, + "step": 16023 + }, + { + "epoch": 2.038417504134334, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.126615524291992, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8722172379493713, + "num_tokens": 611330513.0, + "step": 16024 + }, + { + "epoch": 2.0385447144129247, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93913459777832, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8800371289253235, + "num_tokens": 611369679.0, + "step": 16025 + }, + { + "epoch": 2.0386719246915153, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.115676879882812, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.880606472492218, + "num_tokens": 611410398.0, + "step": 16026 + }, + { + "epoch": 2.038799134970106, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.929283142089844, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8664121031761169, + "num_tokens": 611447151.0, + "step": 16027 + }, + { + "epoch": 2.0389263452486963, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.8391056060791, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8642752170562744, + "num_tokens": 611486923.0, + "step": 16028 + }, + { + "epoch": 2.0390535555272864, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31132698059082, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8780602216720581, + "num_tokens": 611519502.0, + "step": 16029 + }, + { + "epoch": 2.039180765805877, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.956602096557617, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8591236472129822, + "num_tokens": 611557647.0, + "step": 16030 + }, + { + "epoch": 2.0393079760844675, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.088499069213867, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8707502484321594, + "num_tokens": 611597405.0, + "step": 16031 + }, + { + "epoch": 2.039435186363058, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.122852325439453, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8667669296264648, + "num_tokens": 611635996.0, + "step": 16032 + }, + { + "epoch": 2.0395623966416485, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.989559173583984, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8803397417068481, + "num_tokens": 611677900.0, + "step": 16033 + }, + { + "epoch": 2.039689606920239, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.95833969116211, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8744899034500122, + "num_tokens": 611719184.0, + "step": 16034 + }, + { + "epoch": 2.0398168171988296, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.086761474609375, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.870802640914917, + "num_tokens": 611761270.0, + "step": 16035 + }, + { + "epoch": 2.03994402747742, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.112424850463867, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8707947731018066, + "num_tokens": 611794939.0, + "step": 16036 + }, + { + "epoch": 2.0400712377560106, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.221012115478516, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8838177919387817, + "num_tokens": 611834093.0, + "step": 16037 + }, + { + "epoch": 2.040198448034601, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17618179321289, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8674110174179077, + "num_tokens": 611871327.0, + "step": 16038 + }, + { + "epoch": 2.0403256583131917, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99846839904785, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.865694522857666, + "num_tokens": 611907837.0, + "step": 16039 + }, + { + "epoch": 2.040452868591782, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.143373489379883, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.854372501373291, + "num_tokens": 611950964.0, + "step": 16040 + }, + { + "epoch": 2.0405800788703727, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.214126586914062, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8681970834732056, + "num_tokens": 611985867.0, + "step": 16041 + }, + { + "epoch": 2.0407072891489633, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.037519454956055, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8758400678634644, + "num_tokens": 612017819.0, + "step": 16042 + }, + { + "epoch": 2.040834499427554, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.099239349365234, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8575824499130249, + "num_tokens": 612060669.0, + "step": 16043 + }, + { + "epoch": 2.0409617097061443, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.179765701293945, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8590220212936401, + "num_tokens": 612099801.0, + "step": 16044 + }, + { + "epoch": 2.041088919984735, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.899290084838867, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8765410780906677, + "num_tokens": 612139454.0, + "step": 16045 + }, + { + "epoch": 2.0412161302633254, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.128087997436523, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8533375263214111, + "num_tokens": 612181695.0, + "step": 16046 + }, + { + "epoch": 2.041343340541916, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.996112823486328, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8867579698562622, + "num_tokens": 612223200.0, + "step": 16047 + }, + { + "epoch": 2.0414705508205064, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848995208740234, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8728850483894348, + "num_tokens": 612265946.0, + "step": 16048 + }, + { + "epoch": 2.041597761099097, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22586441040039, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8875277042388916, + "num_tokens": 612301246.0, + "step": 16049 + }, + { + "epoch": 2.0417249713776875, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.940500259399414, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8575952053070068, + "num_tokens": 612337789.0, + "step": 16050 + }, + { + "epoch": 2.041852181656278, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.168367385864258, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8849869966506958, + "num_tokens": 612372391.0, + "step": 16051 + }, + { + "epoch": 2.0419793919348685, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14229393005371, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8758150339126587, + "num_tokens": 612408806.0, + "step": 16052 + }, + { + "epoch": 2.0421066022134586, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24007797241211, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8820821046829224, + "num_tokens": 612446349.0, + "step": 16053 + }, + { + "epoch": 2.042233812492049, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.01494026184082, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8848543167114258, + "num_tokens": 612489899.0, + "step": 16054 + }, + { + "epoch": 2.0423610227706397, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.064775466918945, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8674922585487366, + "num_tokens": 612532314.0, + "step": 16055 + }, + { + "epoch": 2.04248823304923, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20319366455078, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8790467977523804, + "num_tokens": 612568033.0, + "step": 16056 + }, + { + "epoch": 2.0426154433278207, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05685043334961, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8805418014526367, + "num_tokens": 612605160.0, + "step": 16057 + }, + { + "epoch": 2.0427426536064113, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.039236068725586, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8611385226249695, + "num_tokens": 612647851.0, + "step": 16058 + }, + { + "epoch": 2.042869863885002, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34173583984375, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8725358247756958, + "num_tokens": 612684770.0, + "step": 16059 + }, + { + "epoch": 2.0429970741635923, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09975242614746, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8616373538970947, + "num_tokens": 612721689.0, + "step": 16060 + }, + { + "epoch": 2.043124284442183, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.059123992919922, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.871076226234436, + "num_tokens": 612757256.0, + "step": 16061 + }, + { + "epoch": 2.0432514947207734, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99460220336914, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8784420490264893, + "num_tokens": 612797806.0, + "step": 16062 + }, + { + "epoch": 2.043378704999364, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15374183654785, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.884175181388855, + "num_tokens": 612833507.0, + "step": 16063 + }, + { + "epoch": 2.0435059152779544, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.956298828125, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8790159225463867, + "num_tokens": 612872920.0, + "step": 16064 + }, + { + "epoch": 2.043633125556545, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42703628540039, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8789368271827698, + "num_tokens": 612909534.0, + "step": 16065 + }, + { + "epoch": 2.0437603358351355, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0461483001709, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8692075610160828, + "num_tokens": 612950531.0, + "step": 16066 + }, + { + "epoch": 2.043887546113726, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14046287536621, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8740330934524536, + "num_tokens": 612984638.0, + "step": 16067 + }, + { + "epoch": 2.0440147563923166, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34149742126465, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8851870894432068, + "num_tokens": 613020632.0, + "step": 16068 + }, + { + "epoch": 2.044141966670907, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.005512237548828, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8769572973251343, + "num_tokens": 613057381.0, + "step": 16069 + }, + { + "epoch": 2.0442691769494976, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0793399810791, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8709434270858765, + "num_tokens": 613094287.0, + "step": 16070 + }, + { + "epoch": 2.044396387228088, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07163429260254, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8800527453422546, + "num_tokens": 613126315.0, + "step": 16071 + }, + { + "epoch": 2.0445235975066787, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.000638961791992, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8683722019195557, + "num_tokens": 613160970.0, + "step": 16072 + }, + { + "epoch": 2.044650807785269, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.832250595092773, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8751202821731567, + "num_tokens": 613204685.0, + "step": 16073 + }, + { + "epoch": 2.0447780180638597, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.086681365966797, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8815118670463562, + "num_tokens": 613242394.0, + "step": 16074 + }, + { + "epoch": 2.0449052283424503, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.088417053222656, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.881705641746521, + "num_tokens": 613283517.0, + "step": 16075 + }, + { + "epoch": 2.045032438621041, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.940488815307617, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8780959844589233, + "num_tokens": 613317237.0, + "step": 16076 + }, + { + "epoch": 2.045159648899631, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14095115661621, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8750650882720947, + "num_tokens": 613355741.0, + "step": 16077 + }, + { + "epoch": 2.0452868591782214, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.116931915283203, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8631985783576965, + "num_tokens": 613392131.0, + "step": 16078 + }, + { + "epoch": 2.045414069456812, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16441535949707, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8639944791793823, + "num_tokens": 613433687.0, + "step": 16079 + }, + { + "epoch": 2.0455412797354025, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.159263610839844, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8763282299041748, + "num_tokens": 613472882.0, + "step": 16080 + }, + { + "epoch": 2.045668490013993, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.251575469970703, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8772379755973816, + "num_tokens": 613515869.0, + "step": 16081 + }, + { + "epoch": 2.0457957002925835, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.852678298950195, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8739241361618042, + "num_tokens": 613551835.0, + "step": 16082 + }, + { + "epoch": 2.045922910571174, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.379125595092773, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8708838820457458, + "num_tokens": 613592277.0, + "step": 16083 + }, + { + "epoch": 2.0460501208497646, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.86928939819336, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.873222827911377, + "num_tokens": 613625594.0, + "step": 16084 + }, + { + "epoch": 2.046177331128355, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09424591064453, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.874740719795227, + "num_tokens": 613659924.0, + "step": 16085 + }, + { + "epoch": 2.0463045414069456, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.25987434387207, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8561903238296509, + "num_tokens": 613697067.0, + "step": 16086 + }, + { + "epoch": 2.046431751685536, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.814916610717773, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8798981308937073, + "num_tokens": 613738211.0, + "step": 16087 + }, + { + "epoch": 2.0465589619641267, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83039665222168, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8779809474945068, + "num_tokens": 613782590.0, + "step": 16088 + }, + { + "epoch": 2.046686172242717, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.129566192626953, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8802623152732849, + "num_tokens": 613825770.0, + "step": 16089 + }, + { + "epoch": 2.0468133825213077, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98519515991211, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8781607747077942, + "num_tokens": 613864025.0, + "step": 16090 + }, + { + "epoch": 2.0469405927998983, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.939334869384766, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8758493661880493, + "num_tokens": 613907181.0, + "step": 16091 + }, + { + "epoch": 2.047067803078489, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.219045639038086, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8636891841888428, + "num_tokens": 613951258.0, + "step": 16092 + }, + { + "epoch": 2.0471950133570793, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.017208099365234, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8712128400802612, + "num_tokens": 613991221.0, + "step": 16093 + }, + { + "epoch": 2.04732222363567, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.990440368652344, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8837798833847046, + "num_tokens": 614026862.0, + "step": 16094 + }, + { + "epoch": 2.0474494339142604, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.12942886352539, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8630474805831909, + "num_tokens": 614067560.0, + "step": 16095 + }, + { + "epoch": 2.047576644192851, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.897682189941406, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8767849802970886, + "num_tokens": 614106974.0, + "step": 16096 + }, + { + "epoch": 2.0477038544714414, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.424278259277344, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8988255262374878, + "num_tokens": 614145641.0, + "step": 16097 + }, + { + "epoch": 2.047831064750032, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.852794647216797, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8770745396614075, + "num_tokens": 614181476.0, + "step": 16098 + }, + { + "epoch": 2.0479582750286225, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.963775634765625, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8691205978393555, + "num_tokens": 614219212.0, + "step": 16099 + }, + { + "epoch": 2.048085485307213, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.057615280151367, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8450253009796143, + "num_tokens": 614264017.0, + "step": 16100 + }, + { + "epoch": 2.0482126955858035, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.073928833007812, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8585885763168335, + "num_tokens": 614299031.0, + "step": 16101 + }, + { + "epoch": 2.0483399058643936, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.018587112426758, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8786017894744873, + "num_tokens": 614334387.0, + "step": 16102 + }, + { + "epoch": 2.048467116142984, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.022476196289062, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8743178844451904, + "num_tokens": 614372438.0, + "step": 16103 + }, + { + "epoch": 2.0485943264215747, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.938682556152344, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8692725896835327, + "num_tokens": 614409976.0, + "step": 16104 + }, + { + "epoch": 2.048721536700165, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.100475311279297, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8820114135742188, + "num_tokens": 614448740.0, + "step": 16105 + }, + { + "epoch": 2.0488487469787557, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.107908248901367, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8783043026924133, + "num_tokens": 614484644.0, + "step": 16106 + }, + { + "epoch": 2.0489759572573463, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.996002197265625, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8544982671737671, + "num_tokens": 614521210.0, + "step": 16107 + }, + { + "epoch": 2.049103167535937, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.934925079345703, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8679016828536987, + "num_tokens": 614561783.0, + "step": 16108 + }, + { + "epoch": 2.0492303778145273, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17707633972168, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8736094236373901, + "num_tokens": 614598643.0, + "step": 16109 + }, + { + "epoch": 2.049357588093118, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.11029815673828, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8726073503494263, + "num_tokens": 614633520.0, + "step": 16110 + }, + { + "epoch": 2.0494847983717084, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.11625862121582, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.86020827293396, + "num_tokens": 614673266.0, + "step": 16111 + }, + { + "epoch": 2.049612008650299, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.027992248535156, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8637999892234802, + "num_tokens": 614707201.0, + "step": 16112 + }, + { + "epoch": 2.0497392189288894, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.007970809936523, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.878385066986084, + "num_tokens": 614746542.0, + "step": 16113 + }, + { + "epoch": 2.04986642920748, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.073076248168945, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8624088764190674, + "num_tokens": 614781804.0, + "step": 16114 + }, + { + "epoch": 2.0499936394860705, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15185546875, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8730500936508179, + "num_tokens": 614817281.0, + "step": 16115 + }, + { + "epoch": 2.050120849764661, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.028453826904297, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.863379955291748, + "num_tokens": 614853666.0, + "step": 16116 + }, + { + "epoch": 2.0502480600432516, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14681053161621, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8741568326950073, + "num_tokens": 614891221.0, + "step": 16117 + }, + { + "epoch": 2.050375270321842, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.213022232055664, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8794280290603638, + "num_tokens": 614925956.0, + "step": 16118 + }, + { + "epoch": 2.0505024806004326, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24127960205078, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8778319954872131, + "num_tokens": 614959934.0, + "step": 16119 + }, + { + "epoch": 2.050629690879023, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.131832122802734, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8695371747016907, + "num_tokens": 615000579.0, + "step": 16120 + }, + { + "epoch": 2.0507569011576137, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.863998413085938, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8668326139450073, + "num_tokens": 615037371.0, + "step": 16121 + }, + { + "epoch": 2.050884111436204, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08213233947754, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8635158538818359, + "num_tokens": 615075948.0, + "step": 16122 + }, + { + "epoch": 2.0510113217147947, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.080760955810547, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8858236074447632, + "num_tokens": 615114619.0, + "step": 16123 + }, + { + "epoch": 2.0511385319933853, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.816736221313477, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8738142251968384, + "num_tokens": 615149260.0, + "step": 16124 + }, + { + "epoch": 2.051265742271976, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18227195739746, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8772508502006531, + "num_tokens": 615183284.0, + "step": 16125 + }, + { + "epoch": 2.0513929525505663, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.082317352294922, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8775185942649841, + "num_tokens": 615220015.0, + "step": 16126 + }, + { + "epoch": 2.0515201628291564, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.78159523010254, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8668346405029297, + "num_tokens": 615255614.0, + "step": 16127 + }, + { + "epoch": 2.051647373107747, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22838592529297, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8739498853683472, + "num_tokens": 615290232.0, + "step": 16128 + }, + { + "epoch": 2.0517745833863374, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.125289916992188, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8786395192146301, + "num_tokens": 615328204.0, + "step": 16129 + }, + { + "epoch": 2.051901793664928, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16118049621582, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8642014265060425, + "num_tokens": 615362846.0, + "step": 16130 + }, + { + "epoch": 2.0520290039435185, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39545440673828, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8716006875038147, + "num_tokens": 615405404.0, + "step": 16131 + }, + { + "epoch": 2.052156214222109, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.771709442138672, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8617382049560547, + "num_tokens": 615439835.0, + "step": 16132 + }, + { + "epoch": 2.0522834245006996, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.046249389648438, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8833171129226685, + "num_tokens": 615477832.0, + "step": 16133 + }, + { + "epoch": 2.05241063477929, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.895578384399414, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8649866580963135, + "num_tokens": 615518089.0, + "step": 16134 + }, + { + "epoch": 2.0525378450578806, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.050630569458008, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8772193193435669, + "num_tokens": 615557286.0, + "step": 16135 + }, + { + "epoch": 2.052665055336471, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89719581604004, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.876903772354126, + "num_tokens": 615593284.0, + "step": 16136 + }, + { + "epoch": 2.0527922656150617, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94715118408203, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8683589696884155, + "num_tokens": 615633579.0, + "step": 16137 + }, + { + "epoch": 2.052919475893652, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.019636154174805, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8776686191558838, + "num_tokens": 615680135.0, + "step": 16138 + }, + { + "epoch": 2.0530466861722427, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5378360748291, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8618838787078857, + "num_tokens": 615714801.0, + "step": 16139 + }, + { + "epoch": 2.0531738964508333, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.100799560546875, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.86480712890625, + "num_tokens": 615750538.0, + "step": 16140 + }, + { + "epoch": 2.053301106729424, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.998889923095703, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8782102465629578, + "num_tokens": 615789465.0, + "step": 16141 + }, + { + "epoch": 2.0534283170080143, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.182640075683594, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8584674596786499, + "num_tokens": 615832553.0, + "step": 16142 + }, + { + "epoch": 2.053555527286605, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.035430908203125, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8687499761581421, + "num_tokens": 615868341.0, + "step": 16143 + }, + { + "epoch": 2.0536827375651954, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.130474090576172, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8817918300628662, + "num_tokens": 615904324.0, + "step": 16144 + }, + { + "epoch": 2.053809947843786, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.01260757446289, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.879785418510437, + "num_tokens": 615947656.0, + "step": 16145 + }, + { + "epoch": 2.0539371581223764, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10162353515625, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.867840051651001, + "num_tokens": 615989050.0, + "step": 16146 + }, + { + "epoch": 2.054064368400967, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.173574447631836, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8773624897003174, + "num_tokens": 616027638.0, + "step": 16147 + }, + { + "epoch": 2.0541915786795575, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06142807006836, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8806482553482056, + "num_tokens": 616066392.0, + "step": 16148 + }, + { + "epoch": 2.054318788958148, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.239025115966797, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8826611042022705, + "num_tokens": 616108303.0, + "step": 16149 + }, + { + "epoch": 2.0544459992367385, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.930185317993164, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8645105957984924, + "num_tokens": 616154174.0, + "step": 16150 + }, + { + "epoch": 2.0545732095153286, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.114118576049805, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8708184957504272, + "num_tokens": 616193954.0, + "step": 16151 + }, + { + "epoch": 2.054700419793919, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07010269165039, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8769357204437256, + "num_tokens": 616232771.0, + "step": 16152 + }, + { + "epoch": 2.0548276300725097, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.991153717041016, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8834183216094971, + "num_tokens": 616265151.0, + "step": 16153 + }, + { + "epoch": 2.0549548403511, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.021146774291992, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8676468729972839, + "num_tokens": 616303886.0, + "step": 16154 + }, + { + "epoch": 2.0550820506296907, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.04793930053711, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8829660415649414, + "num_tokens": 616340040.0, + "step": 16155 + }, + { + "epoch": 2.0552092609082813, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18393325805664, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8774410486221313, + "num_tokens": 616371329.0, + "step": 16156 + }, + { + "epoch": 2.055336471186872, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.962583541870117, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8871846795082092, + "num_tokens": 616406383.0, + "step": 16157 + }, + { + "epoch": 2.0554636814654623, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.073856353759766, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8603680729866028, + "num_tokens": 616445600.0, + "step": 16158 + }, + { + "epoch": 2.055590891744053, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03046417236328, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8652640581130981, + "num_tokens": 616485376.0, + "step": 16159 + }, + { + "epoch": 2.0557181020226434, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96647834777832, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8643674254417419, + "num_tokens": 616527104.0, + "step": 16160 + }, + { + "epoch": 2.055845312301234, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.848608016967773, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8752093315124512, + "num_tokens": 616567557.0, + "step": 16161 + }, + { + "epoch": 2.0559725225798244, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.061147689819336, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8788793087005615, + "num_tokens": 616601612.0, + "step": 16162 + }, + { + "epoch": 2.056099732858415, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03142738342285, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8699769973754883, + "num_tokens": 616638019.0, + "step": 16163 + }, + { + "epoch": 2.0562269431370055, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.987600326538086, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.87028968334198, + "num_tokens": 616675224.0, + "step": 16164 + }, + { + "epoch": 2.056354153415596, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06661033630371, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.870656430721283, + "num_tokens": 616713628.0, + "step": 16165 + }, + { + "epoch": 2.0564813636941865, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.033653259277344, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8699425458908081, + "num_tokens": 616755985.0, + "step": 16166 + }, + { + "epoch": 2.056608573972777, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.019376754760742, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8730404376983643, + "num_tokens": 616786564.0, + "step": 16167 + }, + { + "epoch": 2.0567357842513676, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.894222259521484, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8452209234237671, + "num_tokens": 616823285.0, + "step": 16168 + }, + { + "epoch": 2.056862994529958, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.980728149414062, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8807483911514282, + "num_tokens": 616864465.0, + "step": 16169 + }, + { + "epoch": 2.0569902048085487, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9227352142334, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8637500405311584, + "num_tokens": 616906670.0, + "step": 16170 + }, + { + "epoch": 2.057117415087139, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03253936767578, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8711669445037842, + "num_tokens": 616947656.0, + "step": 16171 + }, + { + "epoch": 2.0572446253657297, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.167043685913086, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8658163547515869, + "num_tokens": 616980161.0, + "step": 16172 + }, + { + "epoch": 2.0573718356443202, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.901269912719727, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8721129894256592, + "num_tokens": 617015281.0, + "step": 16173 + }, + { + "epoch": 2.0574990459229108, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.944839477539062, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8737783432006836, + "num_tokens": 617054174.0, + "step": 16174 + }, + { + "epoch": 2.057626256201501, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91645622253418, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8714295625686646, + "num_tokens": 617095160.0, + "step": 16175 + }, + { + "epoch": 2.0577534664800914, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.075719833374023, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.877241849899292, + "num_tokens": 617130781.0, + "step": 16176 + }, + { + "epoch": 2.057880676758682, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92719841003418, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8712960481643677, + "num_tokens": 617169473.0, + "step": 16177 + }, + { + "epoch": 2.0580078870372724, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.025516510009766, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8667330741882324, + "num_tokens": 617212208.0, + "step": 16178 + }, + { + "epoch": 2.058135097315863, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.972702026367188, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8745003938674927, + "num_tokens": 617246055.0, + "step": 16179 + }, + { + "epoch": 2.0582623075944535, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.949344635009766, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8683533072471619, + "num_tokens": 617285614.0, + "step": 16180 + }, + { + "epoch": 2.058389517873044, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.997913360595703, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8804103136062622, + "num_tokens": 617323401.0, + "step": 16181 + }, + { + "epoch": 2.0585167281516346, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.076950073242188, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8590579032897949, + "num_tokens": 617372550.0, + "step": 16182 + }, + { + "epoch": 2.058643938430225, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.964698791503906, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8786917924880981, + "num_tokens": 617407894.0, + "step": 16183 + }, + { + "epoch": 2.0587711487088156, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05091094970703, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8645049333572388, + "num_tokens": 617450117.0, + "step": 16184 + }, + { + "epoch": 2.058898358987406, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.008487701416016, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8781846165657043, + "num_tokens": 617485885.0, + "step": 16185 + }, + { + "epoch": 2.0590255692659967, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.12407875061035, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8561031818389893, + "num_tokens": 617520511.0, + "step": 16186 + }, + { + "epoch": 2.059152779544587, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.207624435424805, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8664660453796387, + "num_tokens": 617558288.0, + "step": 16187 + }, + { + "epoch": 2.0592799898231777, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.977928161621094, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.871712327003479, + "num_tokens": 617602991.0, + "step": 16188 + }, + { + "epoch": 2.0594072001017683, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.90699577331543, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8696498274803162, + "num_tokens": 617639629.0, + "step": 16189 + }, + { + "epoch": 2.059534410380359, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.170124053955078, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8653668165206909, + "num_tokens": 617676218.0, + "step": 16190 + }, + { + "epoch": 2.0596616206589493, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.791967391967773, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8657737374305725, + "num_tokens": 617716270.0, + "step": 16191 + }, + { + "epoch": 2.05978883093754, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98041534423828, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8809842467308044, + "num_tokens": 617755559.0, + "step": 16192 + }, + { + "epoch": 2.0599160412161304, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.104305267333984, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8742489814758301, + "num_tokens": 617793522.0, + "step": 16193 + }, + { + "epoch": 2.060043251494721, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89661407470703, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8769798278808594, + "num_tokens": 617828005.0, + "step": 16194 + }, + { + "epoch": 2.0601704617733114, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.992815017700195, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8685311079025269, + "num_tokens": 617863824.0, + "step": 16195 + }, + { + "epoch": 2.060297672051902, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.012779235839844, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8681834936141968, + "num_tokens": 617903648.0, + "step": 16196 + }, + { + "epoch": 2.0604248823304925, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.990009307861328, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8691086769104004, + "num_tokens": 617946229.0, + "step": 16197 + }, + { + "epoch": 2.060552092609083, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.783143997192383, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8725229501724243, + "num_tokens": 617990486.0, + "step": 16198 + }, + { + "epoch": 2.0606793028876735, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.989171981811523, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8835710287094116, + "num_tokens": 618031743.0, + "step": 16199 + }, + { + "epoch": 2.0608065131662636, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.135744094848633, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8688941597938538, + "num_tokens": 618070181.0, + "step": 16200 + }, + { + "epoch": 2.060933723444854, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91490364074707, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8792388439178467, + "num_tokens": 618106396.0, + "step": 16201 + }, + { + "epoch": 2.0610609337234447, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.077505111694336, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8769935369491577, + "num_tokens": 618148281.0, + "step": 16202 + }, + { + "epoch": 2.061188144002035, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.947919845581055, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8653095960617065, + "num_tokens": 618182721.0, + "step": 16203 + }, + { + "epoch": 2.0613153542806257, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.899351119995117, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8877896070480347, + "num_tokens": 618219497.0, + "step": 16204 + }, + { + "epoch": 2.0614425645592163, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.143325805664062, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8892362117767334, + "num_tokens": 618258643.0, + "step": 16205 + }, + { + "epoch": 2.061569774837807, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.088451385498047, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.888189971446991, + "num_tokens": 618292682.0, + "step": 16206 + }, + { + "epoch": 2.0616969851163973, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.161727905273438, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8579464554786682, + "num_tokens": 618331375.0, + "step": 16207 + }, + { + "epoch": 2.061824195394988, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.04589080810547, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8772488832473755, + "num_tokens": 618367452.0, + "step": 16208 + }, + { + "epoch": 2.0619514056735784, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.077068328857422, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8710028529167175, + "num_tokens": 618410470.0, + "step": 16209 + }, + { + "epoch": 2.062078615952169, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.092496871948242, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8655003309249878, + "num_tokens": 618447524.0, + "step": 16210 + }, + { + "epoch": 2.0622058262307594, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.956514358520508, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8538346290588379, + "num_tokens": 618481793.0, + "step": 16211 + }, + { + "epoch": 2.06233303650935, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.222854614257812, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8793935179710388, + "num_tokens": 618517097.0, + "step": 16212 + }, + { + "epoch": 2.0624602467879405, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10419464111328, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8709077835083008, + "num_tokens": 618554221.0, + "step": 16213 + }, + { + "epoch": 2.062587457066531, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.115842819213867, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8824993371963501, + "num_tokens": 618591279.0, + "step": 16214 + }, + { + "epoch": 2.0627146673451215, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.988950729370117, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8598781824111938, + "num_tokens": 618628168.0, + "step": 16215 + }, + { + "epoch": 2.062841877623712, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08673667907715, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8776113390922546, + "num_tokens": 618664580.0, + "step": 16216 + }, + { + "epoch": 2.0629690879023026, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.149826049804688, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8613560199737549, + "num_tokens": 618699007.0, + "step": 16217 + }, + { + "epoch": 2.063096298180893, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15045928955078, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8738608360290527, + "num_tokens": 618736630.0, + "step": 16218 + }, + { + "epoch": 2.0632235084594837, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.050220489501953, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.880677342414856, + "num_tokens": 618777868.0, + "step": 16219 + }, + { + "epoch": 2.063350718738074, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2464599609375, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8760858774185181, + "num_tokens": 618819134.0, + "step": 16220 + }, + { + "epoch": 2.0634779290166647, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.028709411621094, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8726142048835754, + "num_tokens": 618862231.0, + "step": 16221 + }, + { + "epoch": 2.0636051392952552, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.936519622802734, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8665293455123901, + "num_tokens": 618906816.0, + "step": 16222 + }, + { + "epoch": 2.0637323495738458, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.319841384887695, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8736891746520996, + "num_tokens": 618943239.0, + "step": 16223 + }, + { + "epoch": 2.0638595598524363, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.090200424194336, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8859936594963074, + "num_tokens": 618978641.0, + "step": 16224 + }, + { + "epoch": 2.0639867701310264, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.100017547607422, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8733947277069092, + "num_tokens": 619020522.0, + "step": 16225 + }, + { + "epoch": 2.064113980409617, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10972785949707, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.885097861289978, + "num_tokens": 619060769.0, + "step": 16226 + }, + { + "epoch": 2.0642411906882074, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23163414001465, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8839908838272095, + "num_tokens": 619098711.0, + "step": 16227 + }, + { + "epoch": 2.064368400966798, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.13613510131836, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8687595129013062, + "num_tokens": 619133673.0, + "step": 16228 + }, + { + "epoch": 2.0644956112453885, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.980791091918945, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8666836023330688, + "num_tokens": 619174554.0, + "step": 16229 + }, + { + "epoch": 2.064622821523979, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.218294143676758, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8625384569168091, + "num_tokens": 619211095.0, + "step": 16230 + }, + { + "epoch": 2.0647500318025696, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91473960876465, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.881956934928894, + "num_tokens": 619246346.0, + "step": 16231 + }, + { + "epoch": 2.06487724208116, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24519157409668, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8703867197036743, + "num_tokens": 619280629.0, + "step": 16232 + }, + { + "epoch": 2.0650044523597506, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.070816040039062, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8540836572647095, + "num_tokens": 619320928.0, + "step": 16233 + }, + { + "epoch": 2.065131662638341, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.88346290588379, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8740649223327637, + "num_tokens": 619369747.0, + "step": 16234 + }, + { + "epoch": 2.0652588729169317, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1281795501709, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8741744160652161, + "num_tokens": 619408694.0, + "step": 16235 + }, + { + "epoch": 2.065386083195522, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.035993576049805, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8808131217956543, + "num_tokens": 619446858.0, + "step": 16236 + }, + { + "epoch": 2.0655132934741127, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.151365280151367, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8729423880577087, + "num_tokens": 619482281.0, + "step": 16237 + }, + { + "epoch": 2.0656405037527032, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.214033126831055, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8819376230239868, + "num_tokens": 619520309.0, + "step": 16238 + }, + { + "epoch": 2.065767714031294, + "ewc_loss": 0.035888671875, + "ewc_loss_parallel": 3.600120544433594e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.901126861572266, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8763110637664795, + "num_tokens": 619554662.0, + "step": 16239 + }, + { + "epoch": 2.0658949243098843, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35564613342285, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8695417642593384, + "num_tokens": 619599632.0, + "step": 16240 + }, + { + "epoch": 2.066022134588475, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.862855911254883, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8857111930847168, + "num_tokens": 619639414.0, + "step": 16241 + }, + { + "epoch": 2.0661493448670654, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15821075439453, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8741740584373474, + "num_tokens": 619675878.0, + "step": 16242 + }, + { + "epoch": 2.066276555145656, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.083023071289062, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8756799697875977, + "num_tokens": 619709486.0, + "step": 16243 + }, + { + "epoch": 2.0664037654242464, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03786849975586, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8729214072227478, + "num_tokens": 619753576.0, + "step": 16244 + }, + { + "epoch": 2.066530975702837, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05076789855957, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8730441331863403, + "num_tokens": 619792520.0, + "step": 16245 + }, + { + "epoch": 2.0666581859814275, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00544548034668, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8515985608100891, + "num_tokens": 619827871.0, + "step": 16246 + }, + { + "epoch": 2.066785396260018, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.970849990844727, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8682982921600342, + "num_tokens": 619864527.0, + "step": 16247 + }, + { + "epoch": 2.0669126065386085, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.115427017211914, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8743813037872314, + "num_tokens": 619904934.0, + "step": 16248 + }, + { + "epoch": 2.0670398168171986, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98331642150879, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.880643367767334, + "num_tokens": 619945472.0, + "step": 16249 + }, + { + "epoch": 2.067167027095789, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.094993591308594, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8843700885772705, + "num_tokens": 619981738.0, + "step": 16250 + }, + { + "epoch": 2.0672942373743797, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03168487548828, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8755900859832764, + "num_tokens": 620024593.0, + "step": 16251 + }, + { + "epoch": 2.06742144765297, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.055923461914062, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8797640800476074, + "num_tokens": 620064059.0, + "step": 16252 + }, + { + "epoch": 2.0675486579315607, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.078868865966797, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8750278949737549, + "num_tokens": 620100028.0, + "step": 16253 + }, + { + "epoch": 2.0676758682101513, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.007808685302734, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8862869143486023, + "num_tokens": 620135181.0, + "step": 16254 + }, + { + "epoch": 2.067803078488742, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.231189727783203, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8746979832649231, + "num_tokens": 620174887.0, + "step": 16255 + }, + { + "epoch": 2.0679302887673323, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.127098083496094, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8732637763023376, + "num_tokens": 620212924.0, + "step": 16256 + }, + { + "epoch": 2.068057499045923, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15964698791504, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.863879919052124, + "num_tokens": 620251088.0, + "step": 16257 + }, + { + "epoch": 2.0681847093245134, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.97698211669922, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8727582097053528, + "num_tokens": 620287888.0, + "step": 16258 + }, + { + "epoch": 2.068311919603104, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.965471267700195, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.871962308883667, + "num_tokens": 620324937.0, + "step": 16259 + }, + { + "epoch": 2.0684391298816944, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.088027954101562, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8741397261619568, + "num_tokens": 620363390.0, + "step": 16260 + }, + { + "epoch": 2.068566340160285, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96940803527832, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8804610371589661, + "num_tokens": 620403707.0, + "step": 16261 + }, + { + "epoch": 2.0686935504388755, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99333953857422, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8766026496887207, + "num_tokens": 620445994.0, + "step": 16262 + }, + { + "epoch": 2.068820760717466, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.098852157592773, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8804507255554199, + "num_tokens": 620480883.0, + "step": 16263 + }, + { + "epoch": 2.0689479709960565, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.983694076538086, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8864419460296631, + "num_tokens": 620518935.0, + "step": 16264 + }, + { + "epoch": 2.069075181274647, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.109580993652344, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8665217161178589, + "num_tokens": 620551985.0, + "step": 16265 + }, + { + "epoch": 2.0692023915532376, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.079225540161133, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8690955638885498, + "num_tokens": 620592809.0, + "step": 16266 + }, + { + "epoch": 2.069329601831828, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.066495895385742, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8589879274368286, + "num_tokens": 620627951.0, + "step": 16267 + }, + { + "epoch": 2.0694568121104187, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.169170379638672, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8783420324325562, + "num_tokens": 620660892.0, + "step": 16268 + }, + { + "epoch": 2.069584022389009, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.990148544311523, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8491864800453186, + "num_tokens": 620698165.0, + "step": 16269 + }, + { + "epoch": 2.0697112326675997, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.12614631652832, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8647991418838501, + "num_tokens": 620734758.0, + "step": 16270 + }, + { + "epoch": 2.0698384429461902, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.997583389282227, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8543754816055298, + "num_tokens": 620770224.0, + "step": 16271 + }, + { + "epoch": 2.0699656532247808, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.030261993408203, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8810040950775146, + "num_tokens": 620805724.0, + "step": 16272 + }, + { + "epoch": 2.070092863503371, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.037067413330078, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8621532917022705, + "num_tokens": 620847622.0, + "step": 16273 + }, + { + "epoch": 2.0702200737819614, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0074520111084, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8692713975906372, + "num_tokens": 620879042.0, + "step": 16274 + }, + { + "epoch": 2.070347284060552, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.803491592407227, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8688843250274658, + "num_tokens": 620922685.0, + "step": 16275 + }, + { + "epoch": 2.0704744943391424, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.020036697387695, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8678723573684692, + "num_tokens": 620954854.0, + "step": 16276 + }, + { + "epoch": 2.070601704617733, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08072280883789, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8705360889434814, + "num_tokens": 620994581.0, + "step": 16277 + }, + { + "epoch": 2.0707289148963235, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.170351028442383, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8741770386695862, + "num_tokens": 621030052.0, + "step": 16278 + }, + { + "epoch": 2.070856125174914, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.025522232055664, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8819798231124878, + "num_tokens": 621072474.0, + "step": 16279 + }, + { + "epoch": 2.0709833354535045, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.063356399536133, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8768036365509033, + "num_tokens": 621104365.0, + "step": 16280 + }, + { + "epoch": 2.071110545732095, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02385711669922, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8775970935821533, + "num_tokens": 621140497.0, + "step": 16281 + }, + { + "epoch": 2.0712377560106856, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.978591918945312, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8758490681648254, + "num_tokens": 621177431.0, + "step": 16282 + }, + { + "epoch": 2.071364966289276, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.342266082763672, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8750973343849182, + "num_tokens": 621212874.0, + "step": 16283 + }, + { + "epoch": 2.0714921765678667, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.120323181152344, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8786910772323608, + "num_tokens": 621249927.0, + "step": 16284 + }, + { + "epoch": 2.071619386846457, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30975914001465, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.848215639591217, + "num_tokens": 621280794.0, + "step": 16285 + }, + { + "epoch": 2.0717465971250477, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06768226623535, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8575357794761658, + "num_tokens": 621321167.0, + "step": 16286 + }, + { + "epoch": 2.0718738074036382, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.983064651489258, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8842974901199341, + "num_tokens": 621357962.0, + "step": 16287 + }, + { + "epoch": 2.0720010176822288, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.19875717163086, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8909556865692139, + "num_tokens": 621398933.0, + "step": 16288 + }, + { + "epoch": 2.0721282279608193, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.038949966430664, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8913443088531494, + "num_tokens": 621442334.0, + "step": 16289 + }, + { + "epoch": 2.07225543823941, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05270004272461, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8757795095443726, + "num_tokens": 621478706.0, + "step": 16290 + }, + { + "epoch": 2.0723826485180004, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18033790588379, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8793564438819885, + "num_tokens": 621513911.0, + "step": 16291 + }, + { + "epoch": 2.072509858796591, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29106330871582, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.867724597454071, + "num_tokens": 621554080.0, + "step": 16292 + }, + { + "epoch": 2.0726370690751814, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09821319580078, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8754670023918152, + "num_tokens": 621592059.0, + "step": 16293 + }, + { + "epoch": 2.072764279353772, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.992258071899414, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8854609131813049, + "num_tokens": 621629626.0, + "step": 16294 + }, + { + "epoch": 2.0728914896323625, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9186954498291, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8775054216384888, + "num_tokens": 621671341.0, + "step": 16295 + }, + { + "epoch": 2.073018699910953, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10845184326172, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8877055048942566, + "num_tokens": 621708756.0, + "step": 16296 + }, + { + "epoch": 2.0731459101895435, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.013887405395508, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8776215314865112, + "num_tokens": 621748009.0, + "step": 16297 + }, + { + "epoch": 2.0732731204681336, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.106945037841797, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.88185715675354, + "num_tokens": 621788695.0, + "step": 16298 + }, + { + "epoch": 2.073400330746724, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02951431274414, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8727664947509766, + "num_tokens": 621828099.0, + "step": 16299 + }, + { + "epoch": 2.0735275410253147, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.121965408325195, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8564619421958923, + "num_tokens": 621867033.0, + "step": 16300 + }, + { + "epoch": 2.073654751303905, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08525848388672, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8643589019775391, + "num_tokens": 621907435.0, + "step": 16301 + }, + { + "epoch": 2.0737819615824957, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.957683563232422, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8637509942054749, + "num_tokens": 621948995.0, + "step": 16302 + }, + { + "epoch": 2.0739091718610863, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05450439453125, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.884638249874115, + "num_tokens": 621991109.0, + "step": 16303 + }, + { + "epoch": 2.074036382139677, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.258216857910156, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8803058862686157, + "num_tokens": 622026585.0, + "step": 16304 + }, + { + "epoch": 2.0741635924182673, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.931325912475586, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8830339312553406, + "num_tokens": 622065208.0, + "step": 16305 + }, + { + "epoch": 2.074290802696858, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.145618438720703, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8655010461807251, + "num_tokens": 622104838.0, + "step": 16306 + }, + { + "epoch": 2.0744180129754484, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.805389404296875, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8641786575317383, + "num_tokens": 622136879.0, + "step": 16307 + }, + { + "epoch": 2.074545223254039, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.105016708374023, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8847246170043945, + "num_tokens": 622175938.0, + "step": 16308 + }, + { + "epoch": 2.0746724335326294, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14592170715332, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8817412257194519, + "num_tokens": 622216663.0, + "step": 16309 + }, + { + "epoch": 2.07479964381122, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.172639846801758, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8817442655563354, + "num_tokens": 622250450.0, + "step": 16310 + }, + { + "epoch": 2.0749268540898105, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.036659240722656, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8663170337677002, + "num_tokens": 622291919.0, + "step": 16311 + }, + { + "epoch": 2.075054064368401, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.216285705566406, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8730331063270569, + "num_tokens": 622325892.0, + "step": 16312 + }, + { + "epoch": 2.0751812746469915, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0963077545166, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8742048740386963, + "num_tokens": 622356966.0, + "step": 16313 + }, + { + "epoch": 2.075308484925582, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.171743392944336, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8767416477203369, + "num_tokens": 622389931.0, + "step": 16314 + }, + { + "epoch": 2.0754356952041726, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.047420501708984, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8732333183288574, + "num_tokens": 622427817.0, + "step": 16315 + }, + { + "epoch": 2.075562905482763, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03792381286621, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8641019463539124, + "num_tokens": 622470353.0, + "step": 16316 + }, + { + "epoch": 2.0756901157613536, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09956169128418, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.889353334903717, + "num_tokens": 622507577.0, + "step": 16317 + }, + { + "epoch": 2.075817326039944, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.011831283569336, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8687012791633606, + "num_tokens": 622544746.0, + "step": 16318 + }, + { + "epoch": 2.0759445363185347, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9786376953125, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8667234778404236, + "num_tokens": 622583688.0, + "step": 16319 + }, + { + "epoch": 2.0760717465971252, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.935380935668945, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8737470507621765, + "num_tokens": 622619911.0, + "step": 16320 + }, + { + "epoch": 2.0761989568757158, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00697135925293, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8788878917694092, + "num_tokens": 622665732.0, + "step": 16321 + }, + { + "epoch": 2.0763261671543063, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03394317626953, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8807961344718933, + "num_tokens": 622705746.0, + "step": 16322 + }, + { + "epoch": 2.0764533774328964, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07236099243164, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8750470876693726, + "num_tokens": 622745481.0, + "step": 16323 + }, + { + "epoch": 2.076580587711487, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.994108200073242, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8758529424667358, + "num_tokens": 622786910.0, + "step": 16324 + }, + { + "epoch": 2.0767077979900774, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.243480682373047, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8693019151687622, + "num_tokens": 622826976.0, + "step": 16325 + }, + { + "epoch": 2.076835008268668, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.240581512451172, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8739388585090637, + "num_tokens": 622858945.0, + "step": 16326 + }, + { + "epoch": 2.0769622185472585, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.220535278320312, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8849883079528809, + "num_tokens": 622893229.0, + "step": 16327 + }, + { + "epoch": 2.077089428825849, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.063020706176758, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8834646940231323, + "num_tokens": 622933701.0, + "step": 16328 + }, + { + "epoch": 2.0772166391044395, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.093095779418945, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8597851991653442, + "num_tokens": 622971226.0, + "step": 16329 + }, + { + "epoch": 2.07734384938303, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.114990234375, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8704350590705872, + "num_tokens": 623009232.0, + "step": 16330 + }, + { + "epoch": 2.0774710596616206, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.016103744506836, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8680645227432251, + "num_tokens": 623048005.0, + "step": 16331 + }, + { + "epoch": 2.077598269940211, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.185150146484375, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8736090660095215, + "num_tokens": 623088474.0, + "step": 16332 + }, + { + "epoch": 2.0777254802188017, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.80544090270996, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.874680757522583, + "num_tokens": 623123700.0, + "step": 16333 + }, + { + "epoch": 2.077852690497392, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.091304779052734, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8758808374404907, + "num_tokens": 623161406.0, + "step": 16334 + }, + { + "epoch": 2.0779799007759827, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.199796676635742, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8639687895774841, + "num_tokens": 623199770.0, + "step": 16335 + }, + { + "epoch": 2.0781071110545732, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.062047958374023, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.876767635345459, + "num_tokens": 623234635.0, + "step": 16336 + }, + { + "epoch": 2.0782343213331638, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16118621826172, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8756594061851501, + "num_tokens": 623273695.0, + "step": 16337 + }, + { + "epoch": 2.0783615316117543, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.228378295898438, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8586503863334656, + "num_tokens": 623315278.0, + "step": 16338 + }, + { + "epoch": 2.078488741890345, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.283382415771484, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8822861909866333, + "num_tokens": 623351753.0, + "step": 16339 + }, + { + "epoch": 2.0786159521689354, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.230588912963867, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8558459281921387, + "num_tokens": 623387812.0, + "step": 16340 + }, + { + "epoch": 2.078743162447526, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98057746887207, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8776514530181885, + "num_tokens": 623426784.0, + "step": 16341 + }, + { + "epoch": 2.0788703727261164, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.206390380859375, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8516385555267334, + "num_tokens": 623463362.0, + "step": 16342 + }, + { + "epoch": 2.078997583004707, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.961509704589844, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8663215637207031, + "num_tokens": 623500108.0, + "step": 16343 + }, + { + "epoch": 2.0791247932832975, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.154876708984375, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8697470426559448, + "num_tokens": 623537159.0, + "step": 16344 + }, + { + "epoch": 2.079252003561888, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.028949737548828, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8920077085494995, + "num_tokens": 623576256.0, + "step": 16345 + }, + { + "epoch": 2.0793792138404785, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.070091247558594, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8701061010360718, + "num_tokens": 623612454.0, + "step": 16346 + }, + { + "epoch": 2.0795064241190686, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.162979125976562, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.886261522769928, + "num_tokens": 623654392.0, + "step": 16347 + }, + { + "epoch": 2.079633634397659, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08531951904297, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8556314706802368, + "num_tokens": 623697350.0, + "step": 16348 + }, + { + "epoch": 2.0797608446762497, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2115478515625, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8773912191390991, + "num_tokens": 623736272.0, + "step": 16349 + }, + { + "epoch": 2.07988805495484, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1796817779541, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8646416068077087, + "num_tokens": 623781319.0, + "step": 16350 + }, + { + "epoch": 2.0800152652334307, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.056804656982422, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8739292025566101, + "num_tokens": 623821944.0, + "step": 16351 + }, + { + "epoch": 2.0801424755120212, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.089801788330078, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8685550093650818, + "num_tokens": 623858399.0, + "step": 16352 + }, + { + "epoch": 2.0802696857906118, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14695167541504, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.875008225440979, + "num_tokens": 623897839.0, + "step": 16353 + }, + { + "epoch": 2.0803968960692023, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22214126586914, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8582216501235962, + "num_tokens": 623933127.0, + "step": 16354 + }, + { + "epoch": 2.080524106347793, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.184690475463867, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8735094666481018, + "num_tokens": 623972140.0, + "step": 16355 + }, + { + "epoch": 2.0806513166263834, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.985095977783203, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8623688220977783, + "num_tokens": 624008875.0, + "step": 16356 + }, + { + "epoch": 2.080778526904974, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.13207244873047, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8704707026481628, + "num_tokens": 624045162.0, + "step": 16357 + }, + { + "epoch": 2.0809057371835644, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9799747467041, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8826860189437866, + "num_tokens": 624085275.0, + "step": 16358 + }, + { + "epoch": 2.081032947462155, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.047483444213867, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8705276250839233, + "num_tokens": 624127479.0, + "step": 16359 + }, + { + "epoch": 2.0811601577407455, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.178176879882812, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8762069344520569, + "num_tokens": 624165231.0, + "step": 16360 + }, + { + "epoch": 2.081287368019336, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.029821395874023, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8615608215332031, + "num_tokens": 624205109.0, + "step": 16361 + }, + { + "epoch": 2.0814145782979265, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.258516311645508, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8688476085662842, + "num_tokens": 624241849.0, + "step": 16362 + }, + { + "epoch": 2.081541788576517, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.961307525634766, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8745708465576172, + "num_tokens": 624274174.0, + "step": 16363 + }, + { + "epoch": 2.0816689988551076, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.986717224121094, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.87116539478302, + "num_tokens": 624311526.0, + "step": 16364 + }, + { + "epoch": 2.081796209133698, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.003211975097656, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8594756126403809, + "num_tokens": 624346657.0, + "step": 16365 + }, + { + "epoch": 2.0819234194122886, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.114765167236328, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8581749200820923, + "num_tokens": 624386563.0, + "step": 16366 + }, + { + "epoch": 2.082050629690879, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.04937744140625, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8827080130577087, + "num_tokens": 624427299.0, + "step": 16367 + }, + { + "epoch": 2.0821778399694697, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.900928497314453, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8560915589332581, + "num_tokens": 624461253.0, + "step": 16368 + }, + { + "epoch": 2.0823050502480602, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.273481369018555, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8811918497085571, + "num_tokens": 624498699.0, + "step": 16369 + }, + { + "epoch": 2.0824322605266508, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.056255340576172, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8702734708786011, + "num_tokens": 624534655.0, + "step": 16370 + }, + { + "epoch": 2.082559470805241, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1850643157959, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8759931325912476, + "num_tokens": 624575342.0, + "step": 16371 + }, + { + "epoch": 2.0826866810838314, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0161075592041, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8776535391807556, + "num_tokens": 624608493.0, + "step": 16372 + }, + { + "epoch": 2.082813891362422, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02265739440918, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8556969165802002, + "num_tokens": 624655777.0, + "step": 16373 + }, + { + "epoch": 2.0829411016410124, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.061323165893555, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8824530839920044, + "num_tokens": 624699349.0, + "step": 16374 + }, + { + "epoch": 2.083068311919603, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03412628173828, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8816738128662109, + "num_tokens": 624730146.0, + "step": 16375 + }, + { + "epoch": 2.0831955221981935, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96963882446289, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8647289276123047, + "num_tokens": 624773186.0, + "step": 16376 + }, + { + "epoch": 2.083322732476784, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.871063232421875, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8712356090545654, + "num_tokens": 624810256.0, + "step": 16377 + }, + { + "epoch": 2.0834499427553745, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.980134963989258, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8629410266876221, + "num_tokens": 624848166.0, + "step": 16378 + }, + { + "epoch": 2.083577153033965, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.020069122314453, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8720130324363708, + "num_tokens": 624887601.0, + "step": 16379 + }, + { + "epoch": 2.0837043633125556, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98763084411621, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8724240660667419, + "num_tokens": 624921160.0, + "step": 16380 + }, + { + "epoch": 2.083831573591146, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.9592227935791, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8654090166091919, + "num_tokens": 624959053.0, + "step": 16381 + }, + { + "epoch": 2.0839587838697367, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14122772216797, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8860819935798645, + "num_tokens": 624995272.0, + "step": 16382 + }, + { + "epoch": 2.084085994148327, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.126012802124023, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8878045678138733, + "num_tokens": 625033388.0, + "step": 16383 + }, + { + "epoch": 2.0842132044269177, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.252967834472656, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.861788809299469, + "num_tokens": 625071103.0, + "step": 16384 + }, + { + "epoch": 2.0843404147055082, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92638397216797, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8849067687988281, + "num_tokens": 625109562.0, + "step": 16385 + }, + { + "epoch": 2.0844676249840988, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.077945709228516, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8774586319923401, + "num_tokens": 625147438.0, + "step": 16386 + }, + { + "epoch": 2.0845948352626893, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07183837890625, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.881870687007904, + "num_tokens": 625184293.0, + "step": 16387 + }, + { + "epoch": 2.08472204554128, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.982532501220703, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8740280270576477, + "num_tokens": 625218201.0, + "step": 16388 + }, + { + "epoch": 2.0848492558198704, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.184106826782227, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8713041543960571, + "num_tokens": 625252072.0, + "step": 16389 + }, + { + "epoch": 2.084976466098461, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.990650177001953, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8854140043258667, + "num_tokens": 625286990.0, + "step": 16390 + }, + { + "epoch": 2.0851036763770514, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.069883346557617, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8831945061683655, + "num_tokens": 625326579.0, + "step": 16391 + }, + { + "epoch": 2.085230886655642, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.168899536132812, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8779045939445496, + "num_tokens": 625367317.0, + "step": 16392 + }, + { + "epoch": 2.0853580969342325, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.882062911987305, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8674075603485107, + "num_tokens": 625401047.0, + "step": 16393 + }, + { + "epoch": 2.085485307212823, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29725456237793, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8589625358581543, + "num_tokens": 625441878.0, + "step": 16394 + }, + { + "epoch": 2.0856125174914135, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.012252807617188, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.872132420539856, + "num_tokens": 625477007.0, + "step": 16395 + }, + { + "epoch": 2.0857397277700036, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.242340087890625, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8850053548812866, + "num_tokens": 625519908.0, + "step": 16396 + }, + { + "epoch": 2.085866938048594, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.100263595581055, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8759407997131348, + "num_tokens": 625554946.0, + "step": 16397 + }, + { + "epoch": 2.0859941483271847, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.018898010253906, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8840658664703369, + "num_tokens": 625592121.0, + "step": 16398 + }, + { + "epoch": 2.086121358605775, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.110214233398438, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8709464073181152, + "num_tokens": 625630596.0, + "step": 16399 + }, + { + "epoch": 2.0862485688843657, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.184301376342773, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8557448983192444, + "num_tokens": 625669535.0, + "step": 16400 + }, + { + "epoch": 2.0863757791629562, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.122596740722656, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8706915974617004, + "num_tokens": 625710072.0, + "step": 16401 + }, + { + "epoch": 2.0865029894415468, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.063371658325195, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8804876804351807, + "num_tokens": 625746328.0, + "step": 16402 + }, + { + "epoch": 2.0866301997201373, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.230369567871094, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8740517497062683, + "num_tokens": 625787108.0, + "step": 16403 + }, + { + "epoch": 2.086757409998728, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.050020217895508, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.876139760017395, + "num_tokens": 625827887.0, + "step": 16404 + }, + { + "epoch": 2.0868846202773184, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17474365234375, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8721476197242737, + "num_tokens": 625863475.0, + "step": 16405 + }, + { + "epoch": 2.087011830555909, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.214008331298828, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.888276219367981, + "num_tokens": 625902369.0, + "step": 16406 + }, + { + "epoch": 2.0871390408344994, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.007062911987305, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8484652638435364, + "num_tokens": 625941054.0, + "step": 16407 + }, + { + "epoch": 2.08726625111309, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.072025299072266, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8548213839530945, + "num_tokens": 625981113.0, + "step": 16408 + }, + { + "epoch": 2.0873934613916805, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15901756286621, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.86984783411026, + "num_tokens": 626014758.0, + "step": 16409 + }, + { + "epoch": 2.087520671670271, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10721778869629, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8590131998062134, + "num_tokens": 626054442.0, + "step": 16410 + }, + { + "epoch": 2.0876478819488615, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14161491394043, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8551955819129944, + "num_tokens": 626093993.0, + "step": 16411 + }, + { + "epoch": 2.087775092227452, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14687156677246, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.855151891708374, + "num_tokens": 626131242.0, + "step": 16412 + }, + { + "epoch": 2.0879023025060426, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.178544998168945, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8575351238250732, + "num_tokens": 626162137.0, + "step": 16413 + }, + { + "epoch": 2.088029512784633, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.020841598510742, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8731153011322021, + "num_tokens": 626201031.0, + "step": 16414 + }, + { + "epoch": 2.0881567230632236, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.962751388549805, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8816820383071899, + "num_tokens": 626243647.0, + "step": 16415 + }, + { + "epoch": 2.088283933341814, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.138427734375, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8779710531234741, + "num_tokens": 626281346.0, + "step": 16416 + }, + { + "epoch": 2.0884111436204047, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.155607223510742, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8809828758239746, + "num_tokens": 626315964.0, + "step": 16417 + }, + { + "epoch": 2.0885383538989952, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.967758178710938, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8658679723739624, + "num_tokens": 626367920.0, + "step": 16418 + }, + { + "epoch": 2.0886655641775858, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20802879333496, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8688783645629883, + "num_tokens": 626412123.0, + "step": 16419 + }, + { + "epoch": 2.0887927744561763, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.01374053955078, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8442481160163879, + "num_tokens": 626451560.0, + "step": 16420 + }, + { + "epoch": 2.0889199847347664, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07309913635254, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8652362823486328, + "num_tokens": 626489361.0, + "step": 16421 + }, + { + "epoch": 2.089047195013357, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.992481231689453, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8872999548912048, + "num_tokens": 626527150.0, + "step": 16422 + }, + { + "epoch": 2.0891744052919474, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.930532455444336, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8650205135345459, + "num_tokens": 626568110.0, + "step": 16423 + }, + { + "epoch": 2.089301615570538, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.084781646728516, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8736721873283386, + "num_tokens": 626607215.0, + "step": 16424 + }, + { + "epoch": 2.0894288258491285, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05449104309082, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8767882585525513, + "num_tokens": 626644424.0, + "step": 16425 + }, + { + "epoch": 2.089556036127719, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.020584106445312, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8508807420730591, + "num_tokens": 626689509.0, + "step": 16426 + }, + { + "epoch": 2.0896832464063095, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.157548904418945, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8616058230400085, + "num_tokens": 626733866.0, + "step": 16427 + }, + { + "epoch": 2.0898104566849, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.12993621826172, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8804773688316345, + "num_tokens": 626767465.0, + "step": 16428 + }, + { + "epoch": 2.0899376669634906, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92375373840332, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8579259514808655, + "num_tokens": 626806049.0, + "step": 16429 + }, + { + "epoch": 2.090064877242081, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17977523803711, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.858475923538208, + "num_tokens": 626848917.0, + "step": 16430 + }, + { + "epoch": 2.0901920875206716, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2217960357666, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8588731288909912, + "num_tokens": 626884442.0, + "step": 16431 + }, + { + "epoch": 2.090319297799262, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.007610321044922, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8702259063720703, + "num_tokens": 626922517.0, + "step": 16432 + }, + { + "epoch": 2.0904465080778527, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.015335083007812, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8703084588050842, + "num_tokens": 626962880.0, + "step": 16433 + }, + { + "epoch": 2.0905737183564432, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.189102172851562, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8791533708572388, + "num_tokens": 627008115.0, + "step": 16434 + }, + { + "epoch": 2.0907009286350338, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.134721755981445, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8832157850265503, + "num_tokens": 627043935.0, + "step": 16435 + }, + { + "epoch": 2.0908281389136243, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.196720123291016, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.873140275478363, + "num_tokens": 627078495.0, + "step": 16436 + }, + { + "epoch": 2.090955349192215, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.957138061523438, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8762136101722717, + "num_tokens": 627116652.0, + "step": 16437 + }, + { + "epoch": 2.0910825594708053, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.941984176635742, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8799815773963928, + "num_tokens": 627152326.0, + "step": 16438 + }, + { + "epoch": 2.091209769749396, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.092300415039062, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8810739517211914, + "num_tokens": 627189502.0, + "step": 16439 + }, + { + "epoch": 2.0913369800279864, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91114616394043, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8874838352203369, + "num_tokens": 627225438.0, + "step": 16440 + }, + { + "epoch": 2.091464190306577, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00706672668457, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8673309087753296, + "num_tokens": 627258154.0, + "step": 16441 + }, + { + "epoch": 2.0915914005851675, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.998720169067383, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8646472692489624, + "num_tokens": 627301804.0, + "step": 16442 + }, + { + "epoch": 2.091718610863758, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14072608947754, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8760265111923218, + "num_tokens": 627344301.0, + "step": 16443 + }, + { + "epoch": 2.0918458211423485, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.086389541625977, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8805534243583679, + "num_tokens": 627381068.0, + "step": 16444 + }, + { + "epoch": 2.0919730314209386, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.168231964111328, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.865158200263977, + "num_tokens": 627423410.0, + "step": 16445 + }, + { + "epoch": 2.092100241699529, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14832305908203, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8815348744392395, + "num_tokens": 627456593.0, + "step": 16446 + }, + { + "epoch": 2.0922274519781197, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05154800415039, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8750553131103516, + "num_tokens": 627493742.0, + "step": 16447 + }, + { + "epoch": 2.09235466225671, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.176738739013672, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8789987564086914, + "num_tokens": 627529809.0, + "step": 16448 + }, + { + "epoch": 2.0924818725353007, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.153600692749023, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8746813535690308, + "num_tokens": 627570693.0, + "step": 16449 + }, + { + "epoch": 2.0926090828138912, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.035154342651367, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8794682025909424, + "num_tokens": 627607608.0, + "step": 16450 + }, + { + "epoch": 2.0927362930924818, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.132814407348633, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8707824945449829, + "num_tokens": 627642102.0, + "step": 16451 + }, + { + "epoch": 2.0928635033710723, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.081920623779297, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8667308688163757, + "num_tokens": 627678838.0, + "step": 16452 + }, + { + "epoch": 2.092990713649663, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.066848754882812, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.875727653503418, + "num_tokens": 627714952.0, + "step": 16453 + }, + { + "epoch": 2.0931179239282534, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.062938690185547, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8622407913208008, + "num_tokens": 627754710.0, + "step": 16454 + }, + { + "epoch": 2.093245134206844, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0368595123291, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8768503665924072, + "num_tokens": 627791504.0, + "step": 16455 + }, + { + "epoch": 2.0933723444854344, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.076068878173828, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8861914873123169, + "num_tokens": 627829683.0, + "step": 16456 + }, + { + "epoch": 2.093499554764025, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.211946487426758, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8727874755859375, + "num_tokens": 627869826.0, + "step": 16457 + }, + { + "epoch": 2.0936267650426155, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93144989013672, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8692663311958313, + "num_tokens": 627904165.0, + "step": 16458 + }, + { + "epoch": 2.093753975321206, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08416175842285, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.877971887588501, + "num_tokens": 627936559.0, + "step": 16459 + }, + { + "epoch": 2.0938811855997965, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.175987243652344, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8771363496780396, + "num_tokens": 627971742.0, + "step": 16460 + }, + { + "epoch": 2.094008395878387, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.967384338378906, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8818426132202148, + "num_tokens": 628013104.0, + "step": 16461 + }, + { + "epoch": 2.0941356061569776, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.066831588745117, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8687037229537964, + "num_tokens": 628049378.0, + "step": 16462 + }, + { + "epoch": 2.094262816435568, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.96245002746582, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.890062153339386, + "num_tokens": 628087213.0, + "step": 16463 + }, + { + "epoch": 2.0943900267141586, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.992067337036133, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.878390908241272, + "num_tokens": 628122060.0, + "step": 16464 + }, + { + "epoch": 2.094517236992749, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.03730583190918, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8755041360855103, + "num_tokens": 628166554.0, + "step": 16465 + }, + { + "epoch": 2.0946444472713397, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.042465209960938, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8708820939064026, + "num_tokens": 628199953.0, + "step": 16466 + }, + { + "epoch": 2.09477165754993, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.338706970214844, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8746390342712402, + "num_tokens": 628237044.0, + "step": 16467 + }, + { + "epoch": 2.0948988678285207, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.174211502075195, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8746421337127686, + "num_tokens": 628275773.0, + "step": 16468 + }, + { + "epoch": 2.095026078107111, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.031831741333008, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.858982503414154, + "num_tokens": 628314549.0, + "step": 16469 + }, + { + "epoch": 2.0951532883857014, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.178207397460938, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8684247136116028, + "num_tokens": 628349417.0, + "step": 16470 + }, + { + "epoch": 2.095280498664292, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.103038787841797, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8768564462661743, + "num_tokens": 628384435.0, + "step": 16471 + }, + { + "epoch": 2.0954077089428824, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.080232620239258, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8598666191101074, + "num_tokens": 628424124.0, + "step": 16472 + }, + { + "epoch": 2.095534919221473, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.165664672851562, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8746042847633362, + "num_tokens": 628462412.0, + "step": 16473 + }, + { + "epoch": 2.0956621295000635, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07128143310547, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.855254054069519, + "num_tokens": 628499066.0, + "step": 16474 + }, + { + "epoch": 2.095789339778654, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.052711486816406, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8866210579872131, + "num_tokens": 628532379.0, + "step": 16475 + }, + { + "epoch": 2.0959165500572445, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.105485916137695, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.882332444190979, + "num_tokens": 628570497.0, + "step": 16476 + }, + { + "epoch": 2.096043760335835, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.979652404785156, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8779067993164062, + "num_tokens": 628608298.0, + "step": 16477 + }, + { + "epoch": 2.0961709706144256, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.93972396850586, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8907730579376221, + "num_tokens": 628641217.0, + "step": 16478 + }, + { + "epoch": 2.096298180893016, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.199438095092773, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8734575510025024, + "num_tokens": 628682480.0, + "step": 16479 + }, + { + "epoch": 2.0964253911716066, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18315315246582, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8701205253601074, + "num_tokens": 628718359.0, + "step": 16480 + }, + { + "epoch": 2.096552601450197, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.394222259521484, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8805727362632751, + "num_tokens": 628752214.0, + "step": 16481 + }, + { + "epoch": 2.0966798117287877, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.01945686340332, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8869218826293945, + "num_tokens": 628787882.0, + "step": 16482 + }, + { + "epoch": 2.0968070220073782, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.802337646484375, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8700204491615295, + "num_tokens": 628824910.0, + "step": 16483 + }, + { + "epoch": 2.0969342322859688, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14580726623535, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8837028741836548, + "num_tokens": 628860657.0, + "step": 16484 + }, + { + "epoch": 2.0970614425645593, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.345535278320312, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8564331531524658, + "num_tokens": 628900619.0, + "step": 16485 + }, + { + "epoch": 2.09718865284315, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.258298873901367, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8845274448394775, + "num_tokens": 628944304.0, + "step": 16486 + }, + { + "epoch": 2.0973158631217403, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16085433959961, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8773320317268372, + "num_tokens": 628980490.0, + "step": 16487 + }, + { + "epoch": 2.097443073400331, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2897891998291, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8774263858795166, + "num_tokens": 629011924.0, + "step": 16488 + }, + { + "epoch": 2.0975702836789214, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.172189712524414, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.848302960395813, + "num_tokens": 629049514.0, + "step": 16489 + }, + { + "epoch": 2.097697493957512, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.3021297454834, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8904857635498047, + "num_tokens": 629085374.0, + "step": 16490 + }, + { + "epoch": 2.0978247042361025, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.088973999023438, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8696705102920532, + "num_tokens": 629126307.0, + "step": 16491 + }, + { + "epoch": 2.097951914514693, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.291067123413086, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8726543188095093, + "num_tokens": 629159693.0, + "step": 16492 + }, + { + "epoch": 2.0980791247932835, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.169321060180664, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.878645122051239, + "num_tokens": 629198544.0, + "step": 16493 + }, + { + "epoch": 2.0982063350718736, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08380699157715, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8649805188179016, + "num_tokens": 629240956.0, + "step": 16494 + }, + { + "epoch": 2.098333545350464, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.380815505981445, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.874398946762085, + "num_tokens": 629276196.0, + "step": 16495 + }, + { + "epoch": 2.0984607556290547, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.219045639038086, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8606458902359009, + "num_tokens": 629311398.0, + "step": 16496 + }, + { + "epoch": 2.098587965907645, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.139413833618164, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8685551285743713, + "num_tokens": 629349488.0, + "step": 16497 + }, + { + "epoch": 2.0987151761862357, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.084928512573242, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8852730989456177, + "num_tokens": 629388023.0, + "step": 16498 + }, + { + "epoch": 2.0988423864648262, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.057172775268555, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8724311590194702, + "num_tokens": 629432140.0, + "step": 16499 + }, + { + "epoch": 2.0989695967434168, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32068634033203, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8587701916694641, + "num_tokens": 629480690.0, + "step": 16500 + }, + { + "epoch": 2.0990968070220073, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18993377685547, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8903291821479797, + "num_tokens": 629522500.0, + "step": 16501 + }, + { + "epoch": 2.099224017300598, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26103401184082, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8737493753433228, + "num_tokens": 629563450.0, + "step": 16502 + }, + { + "epoch": 2.0993512275791884, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.094970703125, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8814907670021057, + "num_tokens": 629603753.0, + "step": 16503 + }, + { + "epoch": 2.099478437857779, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17807388305664, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8757766485214233, + "num_tokens": 629637828.0, + "step": 16504 + }, + { + "epoch": 2.0996056481363694, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.086149215698242, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8811429738998413, + "num_tokens": 629674636.0, + "step": 16505 + }, + { + "epoch": 2.09973285841496, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.380332946777344, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8633882999420166, + "num_tokens": 629717046.0, + "step": 16506 + }, + { + "epoch": 2.0998600686935505, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15616226196289, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8825439810752869, + "num_tokens": 629750331.0, + "step": 16507 + }, + { + "epoch": 2.099987278972141, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.269216537475586, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8540105819702148, + "num_tokens": 629790095.0, + "step": 16508 + }, + { + "epoch": 2.1001144892507315, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.234161376953125, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8874855041503906, + "num_tokens": 629823035.0, + "step": 16509 + }, + { + "epoch": 2.100241699529322, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.13524055480957, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8679794073104858, + "num_tokens": 629857096.0, + "step": 16510 + }, + { + "epoch": 2.1003689098079126, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.277910232543945, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8724063634872437, + "num_tokens": 629893448.0, + "step": 16511 + }, + { + "epoch": 2.100496120086503, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.026721954345703, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8701270818710327, + "num_tokens": 629931301.0, + "step": 16512 + }, + { + "epoch": 2.1006233303650936, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26894187927246, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8811985850334167, + "num_tokens": 629963481.0, + "step": 16513 + }, + { + "epoch": 2.100750540643684, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.142427444458008, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8685225248336792, + "num_tokens": 630002739.0, + "step": 16514 + }, + { + "epoch": 2.1008777509222747, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22747230529785, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8660557866096497, + "num_tokens": 630041206.0, + "step": 16515 + }, + { + "epoch": 2.101004961200865, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14198112487793, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8585697412490845, + "num_tokens": 630083811.0, + "step": 16516 + }, + { + "epoch": 2.1011321714794557, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20353126525879, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8717372417449951, + "num_tokens": 630119823.0, + "step": 16517 + }, + { + "epoch": 2.1012593817580463, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.234920501708984, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8727669715881348, + "num_tokens": 630163807.0, + "step": 16518 + }, + { + "epoch": 2.1013865920366364, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94280242919922, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8684839606285095, + "num_tokens": 630203038.0, + "step": 16519 + }, + { + "epoch": 2.101513802315227, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06256866455078, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.877174973487854, + "num_tokens": 630238505.0, + "step": 16520 + }, + { + "epoch": 2.1016410125938174, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.266199111938477, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8719971776008606, + "num_tokens": 630275279.0, + "step": 16521 + }, + { + "epoch": 2.101768222872408, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.138151168823242, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8704265356063843, + "num_tokens": 630317288.0, + "step": 16522 + }, + { + "epoch": 2.1018954331509985, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06856346130371, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8735498189926147, + "num_tokens": 630352786.0, + "step": 16523 + }, + { + "epoch": 2.102022643429589, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.415435791015625, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8732573986053467, + "num_tokens": 630391138.0, + "step": 16524 + }, + { + "epoch": 2.1021498537081795, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.196762084960938, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8611866235733032, + "num_tokens": 630426567.0, + "step": 16525 + }, + { + "epoch": 2.10227706398677, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.138553619384766, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.862686038017273, + "num_tokens": 630467308.0, + "step": 16526 + }, + { + "epoch": 2.1024042742653606, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23443031311035, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8835378885269165, + "num_tokens": 630503333.0, + "step": 16527 + }, + { + "epoch": 2.102531484543951, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.064441680908203, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8686535358428955, + "num_tokens": 630536838.0, + "step": 16528 + }, + { + "epoch": 2.1026586948225416, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.103515625, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8855425119400024, + "num_tokens": 630570833.0, + "step": 16529 + }, + { + "epoch": 2.102785905101132, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34085464477539, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8810850381851196, + "num_tokens": 630615043.0, + "step": 16530 + }, + { + "epoch": 2.1029131153797227, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21656036376953, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8947083950042725, + "num_tokens": 630651368.0, + "step": 16531 + }, + { + "epoch": 2.1030403256583132, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.204439163208008, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.865038275718689, + "num_tokens": 630690341.0, + "step": 16532 + }, + { + "epoch": 2.1031675359369038, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.085678100585938, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8714200258255005, + "num_tokens": 630727168.0, + "step": 16533 + }, + { + "epoch": 2.1032947462154943, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17728042602539, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8772684931755066, + "num_tokens": 630766737.0, + "step": 16534 + }, + { + "epoch": 2.103421956494085, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.209918975830078, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8710228800773621, + "num_tokens": 630803721.0, + "step": 16535 + }, + { + "epoch": 2.1035491667726753, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31464195251465, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8796672821044922, + "num_tokens": 630843697.0, + "step": 16536 + }, + { + "epoch": 2.103676377051266, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37643051147461, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8743481040000916, + "num_tokens": 630874325.0, + "step": 16537 + }, + { + "epoch": 2.1038035873298564, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35546875, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8691904544830322, + "num_tokens": 630911456.0, + "step": 16538 + }, + { + "epoch": 2.103930797608447, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.249048233032227, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8717958331108093, + "num_tokens": 630954418.0, + "step": 16539 + }, + { + "epoch": 2.1040580078870375, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44218635559082, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.872747004032135, + "num_tokens": 630991826.0, + "step": 16540 + }, + { + "epoch": 2.104185218165628, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02035903930664, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8731034994125366, + "num_tokens": 631025831.0, + "step": 16541 + }, + { + "epoch": 2.1043124284442185, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.393447875976562, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8577797412872314, + "num_tokens": 631062896.0, + "step": 16542 + }, + { + "epoch": 2.1044396387228086, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33930778503418, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8794779777526855, + "num_tokens": 631099291.0, + "step": 16543 + }, + { + "epoch": 2.104566849001399, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.181297302246094, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8648756742477417, + "num_tokens": 631141570.0, + "step": 16544 + }, + { + "epoch": 2.1046940592799896, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.104270935058594, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8657590746879578, + "num_tokens": 631178882.0, + "step": 16545 + }, + { + "epoch": 2.10482126955858, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.256927490234375, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.870026707649231, + "num_tokens": 631218059.0, + "step": 16546 + }, + { + "epoch": 2.1049484798371707, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.27693748474121, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8741984367370605, + "num_tokens": 631252827.0, + "step": 16547 + }, + { + "epoch": 2.1050756901157612, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.188814163208008, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8798976540565491, + "num_tokens": 631294370.0, + "step": 16548 + }, + { + "epoch": 2.1052029003943518, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.108396530151367, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8694196939468384, + "num_tokens": 631331091.0, + "step": 16549 + }, + { + "epoch": 2.1053301106729423, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31624412536621, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8764463663101196, + "num_tokens": 631367285.0, + "step": 16550 + }, + { + "epoch": 2.105457320951533, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14889907836914, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8594019412994385, + "num_tokens": 631402471.0, + "step": 16551 + }, + { + "epoch": 2.1055845312301233, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15471839904785, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8738620281219482, + "num_tokens": 631439706.0, + "step": 16552 + }, + { + "epoch": 2.105711741508714, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.19890785217285, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8771008253097534, + "num_tokens": 631471437.0, + "step": 16553 + }, + { + "epoch": 2.1058389517873044, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.592838287353516, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8707367181777954, + "num_tokens": 631515028.0, + "step": 16554 + }, + { + "epoch": 2.105966162065895, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.241291046142578, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8678230047225952, + "num_tokens": 631552946.0, + "step": 16555 + }, + { + "epoch": 2.1060933723444855, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.038297653198242, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8776603937149048, + "num_tokens": 631596045.0, + "step": 16556 + }, + { + "epoch": 2.106220582623076, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.11844825744629, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.880765438079834, + "num_tokens": 631630233.0, + "step": 16557 + }, + { + "epoch": 2.1063477929016665, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.132619857788086, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8792831897735596, + "num_tokens": 631666665.0, + "step": 16558 + }, + { + "epoch": 2.106475003180257, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17340087890625, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8692340850830078, + "num_tokens": 631709536.0, + "step": 16559 + }, + { + "epoch": 2.1066022134588476, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.979921340942383, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8874986171722412, + "num_tokens": 631744333.0, + "step": 16560 + }, + { + "epoch": 2.106729423737438, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09486961364746, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8584274053573608, + "num_tokens": 631783522.0, + "step": 16561 + }, + { + "epoch": 2.1068566340160286, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.438966751098633, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8566653728485107, + "num_tokens": 631822579.0, + "step": 16562 + }, + { + "epoch": 2.106983844294619, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.992176055908203, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8790022730827332, + "num_tokens": 631862642.0, + "step": 16563 + }, + { + "epoch": 2.1071110545732097, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.033287048339844, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8831872940063477, + "num_tokens": 631903161.0, + "step": 16564 + }, + { + "epoch": 2.1072382648518, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.136066436767578, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.876204252243042, + "num_tokens": 631938149.0, + "step": 16565 + }, + { + "epoch": 2.1073654751303907, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21518325805664, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.874504804611206, + "num_tokens": 631974717.0, + "step": 16566 + }, + { + "epoch": 2.107492685408981, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.236146926879883, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8765223622322083, + "num_tokens": 632011147.0, + "step": 16567 + }, + { + "epoch": 2.1076198956875714, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.935548782348633, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8799650073051453, + "num_tokens": 632051915.0, + "step": 16568 + }, + { + "epoch": 2.107747105966162, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.067564010620117, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8640838861465454, + "num_tokens": 632083047.0, + "step": 16569 + }, + { + "epoch": 2.1078743162447524, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.231203079223633, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8852062821388245, + "num_tokens": 632119250.0, + "step": 16570 + }, + { + "epoch": 2.108001526523343, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30712890625, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.878828227519989, + "num_tokens": 632152696.0, + "step": 16571 + }, + { + "epoch": 2.1081287368019335, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15685272216797, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8931776285171509, + "num_tokens": 632188167.0, + "step": 16572 + }, + { + "epoch": 2.108255947080524, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09011459350586, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8714640736579895, + "num_tokens": 632229567.0, + "step": 16573 + }, + { + "epoch": 2.1083831573591145, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.291072845458984, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8822367787361145, + "num_tokens": 632266870.0, + "step": 16574 + }, + { + "epoch": 2.108510367637705, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.029428482055664, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.873079240322113, + "num_tokens": 632303844.0, + "step": 16575 + }, + { + "epoch": 2.1086375779162956, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.055496215820312, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8754276037216187, + "num_tokens": 632339343.0, + "step": 16576 + }, + { + "epoch": 2.108764788194886, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09742546081543, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8781954050064087, + "num_tokens": 632385516.0, + "step": 16577 + }, + { + "epoch": 2.1088919984734766, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.0596923828125, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8771854639053345, + "num_tokens": 632420664.0, + "step": 16578 + }, + { + "epoch": 2.109019208752067, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17719268798828, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8790746927261353, + "num_tokens": 632458082.0, + "step": 16579 + }, + { + "epoch": 2.1091464190306577, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.181320190429688, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8614678382873535, + "num_tokens": 632495963.0, + "step": 16580 + }, + { + "epoch": 2.109273629309248, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.081802368164062, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8749332427978516, + "num_tokens": 632529968.0, + "step": 16581 + }, + { + "epoch": 2.1094008395878387, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22073745727539, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8816121816635132, + "num_tokens": 632567527.0, + "step": 16582 + }, + { + "epoch": 2.1095280498664293, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.235031127929688, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8719282150268555, + "num_tokens": 632606192.0, + "step": 16583 + }, + { + "epoch": 2.10965526014502, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.138458251953125, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8836735486984253, + "num_tokens": 632645940.0, + "step": 16584 + }, + { + "epoch": 2.1097824704236103, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21401023864746, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8813239336013794, + "num_tokens": 632689385.0, + "step": 16585 + }, + { + "epoch": 2.109909680702201, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.111055374145508, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.879806399345398, + "num_tokens": 632720178.0, + "step": 16586 + }, + { + "epoch": 2.1100368909807914, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.184885025024414, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8654404878616333, + "num_tokens": 632763249.0, + "step": 16587 + }, + { + "epoch": 2.110164101259382, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.042034149169922, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8552680015563965, + "num_tokens": 632803293.0, + "step": 16588 + }, + { + "epoch": 2.1102913115379724, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35219383239746, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8644225597381592, + "num_tokens": 632838731.0, + "step": 16589 + }, + { + "epoch": 2.110418521816563, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.015491485595703, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8645225763320923, + "num_tokens": 632875044.0, + "step": 16590 + }, + { + "epoch": 2.1105457320951535, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.038162231445312, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8530586957931519, + "num_tokens": 632911330.0, + "step": 16591 + }, + { + "epoch": 2.1106729423737436, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.152027130126953, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8624905347824097, + "num_tokens": 632952122.0, + "step": 16592 + }, + { + "epoch": 2.110800152652334, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.19409942626953, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8558798432350159, + "num_tokens": 632985602.0, + "step": 16593 + }, + { + "epoch": 2.1109273629309246, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15032386779785, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8742027282714844, + "num_tokens": 633023792.0, + "step": 16594 + }, + { + "epoch": 2.111054573209515, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.218936920166016, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8745383620262146, + "num_tokens": 633065457.0, + "step": 16595 + }, + { + "epoch": 2.1111817834881057, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99378776550293, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8822329640388489, + "num_tokens": 633101402.0, + "step": 16596 + }, + { + "epoch": 2.1113089937666962, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.025859832763672, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.864208459854126, + "num_tokens": 633147002.0, + "step": 16597 + }, + { + "epoch": 2.1114362040452868, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.232587814331055, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8730796575546265, + "num_tokens": 633184209.0, + "step": 16598 + }, + { + "epoch": 2.1115634143238773, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14998435974121, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8776968717575073, + "num_tokens": 633224525.0, + "step": 16599 + }, + { + "epoch": 2.111690624602468, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17302131652832, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8880453109741211, + "num_tokens": 633253171.0, + "step": 16600 + }, + { + "epoch": 2.1118178348810583, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.153396606445312, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.870145857334137, + "num_tokens": 633294326.0, + "step": 16601 + }, + { + "epoch": 2.111945045159649, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.233203887939453, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8861439228057861, + "num_tokens": 633327236.0, + "step": 16602 + }, + { + "epoch": 2.1120722554382394, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.105215072631836, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8832924962043762, + "num_tokens": 633368593.0, + "step": 16603 + }, + { + "epoch": 2.11219946571683, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02484130859375, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.852593719959259, + "num_tokens": 633402598.0, + "step": 16604 + }, + { + "epoch": 2.1123266759954205, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36574363708496, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8686157464981079, + "num_tokens": 633437069.0, + "step": 16605 + }, + { + "epoch": 2.112453886274011, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.027109146118164, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8752982020378113, + "num_tokens": 633479752.0, + "step": 16606 + }, + { + "epoch": 2.1125810965526015, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.338851928710938, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.869636595249176, + "num_tokens": 633516949.0, + "step": 16607 + }, + { + "epoch": 2.112708306831192, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.189903259277344, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8775005340576172, + "num_tokens": 633547511.0, + "step": 16608 + }, + { + "epoch": 2.1128355171097826, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.129838943481445, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8756055235862732, + "num_tokens": 633583940.0, + "step": 16609 + }, + { + "epoch": 2.112962727388373, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22675323486328, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8626418113708496, + "num_tokens": 633626245.0, + "step": 16610 + }, + { + "epoch": 2.1130899376669636, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.01824951171875, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8760178685188293, + "num_tokens": 633662889.0, + "step": 16611 + }, + { + "epoch": 2.113217147945554, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.190032958984375, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.869452953338623, + "num_tokens": 633701461.0, + "step": 16612 + }, + { + "epoch": 2.1133443582241447, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.157787322998047, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8737157583236694, + "num_tokens": 633741302.0, + "step": 16613 + }, + { + "epoch": 2.113471568502735, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.172115325927734, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.878637433052063, + "num_tokens": 633780337.0, + "step": 16614 + }, + { + "epoch": 2.1135987787813257, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.106563568115234, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8802822828292847, + "num_tokens": 633814450.0, + "step": 16615 + }, + { + "epoch": 2.1137259890599163, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.94045066833496, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8659589886665344, + "num_tokens": 633856252.0, + "step": 16616 + }, + { + "epoch": 2.1138531993385064, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.234020233154297, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8620370626449585, + "num_tokens": 633895235.0, + "step": 16617 + }, + { + "epoch": 2.113980409617097, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98695182800293, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8746997714042664, + "num_tokens": 633938948.0, + "step": 16618 + }, + { + "epoch": 2.1141076198956874, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.171234130859375, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8796831965446472, + "num_tokens": 633973393.0, + "step": 16619 + }, + { + "epoch": 2.114234830174278, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.176326751708984, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8775907754898071, + "num_tokens": 634006714.0, + "step": 16620 + }, + { + "epoch": 2.1143620404528685, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02447509765625, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8788328170776367, + "num_tokens": 634046104.0, + "step": 16621 + }, + { + "epoch": 2.114489250731459, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98275375366211, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8650635480880737, + "num_tokens": 634086401.0, + "step": 16622 + }, + { + "epoch": 2.1146164610100495, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.301530838012695, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8646590709686279, + "num_tokens": 634125479.0, + "step": 16623 + }, + { + "epoch": 2.11474367128864, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.984220504760742, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.854583740234375, + "num_tokens": 634163873.0, + "step": 16624 + }, + { + "epoch": 2.1148708815672306, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.523881912231445, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8705164194107056, + "num_tokens": 634200533.0, + "step": 16625 + }, + { + "epoch": 2.114998091845821, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.91698455810547, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.866202175617218, + "num_tokens": 634235891.0, + "step": 16626 + }, + { + "epoch": 2.1151253021244116, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.171024322509766, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8718916177749634, + "num_tokens": 634266748.0, + "step": 16627 + }, + { + "epoch": 2.115252512403002, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.145734786987305, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8608932495117188, + "num_tokens": 634302603.0, + "step": 16628 + }, + { + "epoch": 2.1153797226815927, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.183996200561523, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8746328353881836, + "num_tokens": 634345471.0, + "step": 16629 + }, + { + "epoch": 2.115506932960183, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24094009399414, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8705408573150635, + "num_tokens": 634383902.0, + "step": 16630 + }, + { + "epoch": 2.1156341432387737, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.25509262084961, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8663623332977295, + "num_tokens": 634421082.0, + "step": 16631 + }, + { + "epoch": 2.1157613535173643, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.153417587280273, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8672940135002136, + "num_tokens": 634456490.0, + "step": 16632 + }, + { + "epoch": 2.115888563795955, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21332359313965, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8540346622467041, + "num_tokens": 634498252.0, + "step": 16633 + }, + { + "epoch": 2.1160157740745453, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.083662033081055, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8804186582565308, + "num_tokens": 634540769.0, + "step": 16634 + }, + { + "epoch": 2.116142984353136, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.436079025268555, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8791858553886414, + "num_tokens": 634573807.0, + "step": 16635 + }, + { + "epoch": 2.1162701946317264, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.289236068725586, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8683151006698608, + "num_tokens": 634609542.0, + "step": 16636 + }, + { + "epoch": 2.116397404910317, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.035436630249023, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8812747001647949, + "num_tokens": 634648641.0, + "step": 16637 + }, + { + "epoch": 2.1165246151889074, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22873306274414, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8711445331573486, + "num_tokens": 634685531.0, + "step": 16638 + }, + { + "epoch": 2.116651825467498, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.178359985351562, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8743367195129395, + "num_tokens": 634717239.0, + "step": 16639 + }, + { + "epoch": 2.116779035746088, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1124210357666, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8690769672393799, + "num_tokens": 634750058.0, + "step": 16640 + }, + { + "epoch": 2.1169062460246786, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.239051818847656, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8662289381027222, + "num_tokens": 634793480.0, + "step": 16641 + }, + { + "epoch": 2.117033456303269, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.336519241333008, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8627090454101562, + "num_tokens": 634829923.0, + "step": 16642 + }, + { + "epoch": 2.1171606665818596, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.462095260620117, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8675161600112915, + "num_tokens": 634868499.0, + "step": 16643 + }, + { + "epoch": 2.11728787686045, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09673500061035, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8809433579444885, + "num_tokens": 634905922.0, + "step": 16644 + }, + { + "epoch": 2.1174150871390407, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.300949096679688, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8735138773918152, + "num_tokens": 634946190.0, + "step": 16645 + }, + { + "epoch": 2.1175422974176312, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.106287002563477, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.884511411190033, + "num_tokens": 634981840.0, + "step": 16646 + }, + { + "epoch": 2.1176695076962218, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.075963973999023, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8707653284072876, + "num_tokens": 635024755.0, + "step": 16647 + }, + { + "epoch": 2.1177967179748123, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1641845703125, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8769165873527527, + "num_tokens": 635063076.0, + "step": 16648 + }, + { + "epoch": 2.117923928253403, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.247173309326172, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8843924403190613, + "num_tokens": 635103052.0, + "step": 16649 + }, + { + "epoch": 2.1180511385319933, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.98343276977539, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8825328350067139, + "num_tokens": 635140917.0, + "step": 16650 + }, + { + "epoch": 2.118178348810584, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.227291107177734, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8774161338806152, + "num_tokens": 635182049.0, + "step": 16651 + }, + { + "epoch": 2.1183055590891744, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.338470458984375, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8681271076202393, + "num_tokens": 635217613.0, + "step": 16652 + }, + { + "epoch": 2.118432769367765, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.230680465698242, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.881477952003479, + "num_tokens": 635252716.0, + "step": 16653 + }, + { + "epoch": 2.1185599796463555, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.007688522338867, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8760199546813965, + "num_tokens": 635288919.0, + "step": 16654 + }, + { + "epoch": 2.118687189924946, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.064565658569336, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8680132627487183, + "num_tokens": 635329781.0, + "step": 16655 + }, + { + "epoch": 2.1188144002035365, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.354888916015625, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8822107315063477, + "num_tokens": 635363844.0, + "step": 16656 + }, + { + "epoch": 2.118941610482127, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.11797332763672, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8698229193687439, + "num_tokens": 635397568.0, + "step": 16657 + }, + { + "epoch": 2.1190688207607176, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.281984329223633, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.857864260673523, + "num_tokens": 635432903.0, + "step": 16658 + }, + { + "epoch": 2.119196031039308, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23981475830078, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8743187189102173, + "num_tokens": 635473388.0, + "step": 16659 + }, + { + "epoch": 2.1193232413178986, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.243282318115234, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8765558004379272, + "num_tokens": 635512966.0, + "step": 16660 + }, + { + "epoch": 2.119450451596489, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1429386138916, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8596100807189941, + "num_tokens": 635547814.0, + "step": 16661 + }, + { + "epoch": 2.1195776618750797, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.165342330932617, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.863369345664978, + "num_tokens": 635583120.0, + "step": 16662 + }, + { + "epoch": 2.11970487215367, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.92806053161621, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8684880137443542, + "num_tokens": 635621531.0, + "step": 16663 + }, + { + "epoch": 2.1198320824322607, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.280052185058594, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.862592339515686, + "num_tokens": 635659641.0, + "step": 16664 + }, + { + "epoch": 2.119959292710851, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21562385559082, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8678696751594543, + "num_tokens": 635697953.0, + "step": 16665 + }, + { + "epoch": 2.1200865029894413, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.313627243041992, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.847055196762085, + "num_tokens": 635742519.0, + "step": 16666 + }, + { + "epoch": 2.120213713268032, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.339696884155273, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8799959421157837, + "num_tokens": 635777786.0, + "step": 16667 + }, + { + "epoch": 2.1203409235466224, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.198993682861328, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8744650483131409, + "num_tokens": 635814490.0, + "step": 16668 + }, + { + "epoch": 2.120468133825213, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.447614669799805, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8594458103179932, + "num_tokens": 635858351.0, + "step": 16669 + }, + { + "epoch": 2.1205953441038035, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.07895851135254, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8690213561058044, + "num_tokens": 635895585.0, + "step": 16670 + }, + { + "epoch": 2.120722554382394, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21245574951172, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8794046640396118, + "num_tokens": 635933591.0, + "step": 16671 + }, + { + "epoch": 2.1208497646609845, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.218215942382812, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8687390089035034, + "num_tokens": 635972040.0, + "step": 16672 + }, + { + "epoch": 2.120976974939575, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22045135498047, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8757290244102478, + "num_tokens": 636013287.0, + "step": 16673 + }, + { + "epoch": 2.1211041852181656, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.99224281311035, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8668947219848633, + "num_tokens": 636046137.0, + "step": 16674 + }, + { + "epoch": 2.121231395496756, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6275634765625, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8614062666893005, + "num_tokens": 636088870.0, + "step": 16675 + }, + { + "epoch": 2.1213586057753466, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.229543685913086, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8821058869361877, + "num_tokens": 636130519.0, + "step": 16676 + }, + { + "epoch": 2.121485816053937, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.479658126831055, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8754369020462036, + "num_tokens": 636165641.0, + "step": 16677 + }, + { + "epoch": 2.1216130263325277, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.314716339111328, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8591052293777466, + "num_tokens": 636203811.0, + "step": 16678 + }, + { + "epoch": 2.121740236611118, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.324134826660156, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8843526244163513, + "num_tokens": 636239503.0, + "step": 16679 + }, + { + "epoch": 2.1218674468897087, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.223846435546875, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8757744431495667, + "num_tokens": 636278994.0, + "step": 16680 + }, + { + "epoch": 2.1219946571682993, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.374679565429688, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8679774403572083, + "num_tokens": 636318025.0, + "step": 16681 + }, + { + "epoch": 2.12212186744689, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.170499801635742, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8740103244781494, + "num_tokens": 636357289.0, + "step": 16682 + }, + { + "epoch": 2.1222490777254803, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.362730026245117, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8569506406784058, + "num_tokens": 636399067.0, + "step": 16683 + }, + { + "epoch": 2.122376288004071, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.542943954467773, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8775715231895447, + "num_tokens": 636442418.0, + "step": 16684 + }, + { + "epoch": 2.1225034982826614, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09941291809082, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8606764674186707, + "num_tokens": 636484273.0, + "step": 16685 + }, + { + "epoch": 2.122630708561252, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16946029663086, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8814818859100342, + "num_tokens": 636525675.0, + "step": 16686 + }, + { + "epoch": 2.1227579188398424, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.303327560424805, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8714168667793274, + "num_tokens": 636559652.0, + "step": 16687 + }, + { + "epoch": 2.122885129118433, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.138736724853516, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8900920152664185, + "num_tokens": 636597900.0, + "step": 16688 + }, + { + "epoch": 2.1230123393970235, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.122066497802734, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8738962411880493, + "num_tokens": 636635822.0, + "step": 16689 + }, + { + "epoch": 2.1231395496756136, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16619110107422, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8635434508323669, + "num_tokens": 636667275.0, + "step": 16690 + }, + { + "epoch": 2.123266759954204, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23402214050293, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8747873902320862, + "num_tokens": 636704723.0, + "step": 16691 + }, + { + "epoch": 2.1233939702327946, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.210975646972656, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.877269446849823, + "num_tokens": 636748209.0, + "step": 16692 + }, + { + "epoch": 2.123521180511385, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16668128967285, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8784313797950745, + "num_tokens": 636785052.0, + "step": 16693 + }, + { + "epoch": 2.1236483907899757, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.285978317260742, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8843856453895569, + "num_tokens": 636816073.0, + "step": 16694 + }, + { + "epoch": 2.123775601068566, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24022102355957, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.879468560218811, + "num_tokens": 636848963.0, + "step": 16695 + }, + { + "epoch": 2.1239028113471567, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10550308227539, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8863844871520996, + "num_tokens": 636880632.0, + "step": 16696 + }, + { + "epoch": 2.1240300216257473, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.952219009399414, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8601233959197998, + "num_tokens": 636923597.0, + "step": 16697 + }, + { + "epoch": 2.124157231904338, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.374082565307617, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8669754266738892, + "num_tokens": 636962250.0, + "step": 16698 + }, + { + "epoch": 2.1242844421829283, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.305221557617188, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8798280954360962, + "num_tokens": 636997973.0, + "step": 16699 + }, + { + "epoch": 2.124411652461519, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.158414840698242, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8601491451263428, + "num_tokens": 637037320.0, + "step": 16700 + }, + { + "epoch": 2.1245388627401094, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.060588836669922, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8514459133148193, + "num_tokens": 637072267.0, + "step": 16701 + }, + { + "epoch": 2.1246660730187, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.266977310180664, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8869711756706238, + "num_tokens": 637108782.0, + "step": 16702 + }, + { + "epoch": 2.1247932832972904, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.197948455810547, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8715755343437195, + "num_tokens": 637147368.0, + "step": 16703 + }, + { + "epoch": 2.124920493575881, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.83271026611328, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8825060129165649, + "num_tokens": 637181909.0, + "step": 16704 + }, + { + "epoch": 2.1250477038544715, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.939979553222656, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.872277557849884, + "num_tokens": 637218498.0, + "step": 16705 + }, + { + "epoch": 2.125174914133062, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.117076873779297, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8740270137786865, + "num_tokens": 637254825.0, + "step": 16706 + }, + { + "epoch": 2.1253021244116526, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72327995300293, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8634356260299683, + "num_tokens": 637301593.0, + "step": 16707 + }, + { + "epoch": 2.125429334690243, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.982690811157227, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8847317695617676, + "num_tokens": 637341676.0, + "step": 16708 + }, + { + "epoch": 2.1255565449688336, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.068647384643555, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8774372935295105, + "num_tokens": 637378840.0, + "step": 16709 + }, + { + "epoch": 2.125683755247424, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.130088806152344, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8559581637382507, + "num_tokens": 637412711.0, + "step": 16710 + }, + { + "epoch": 2.1258109655260147, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.266498565673828, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8714081645011902, + "num_tokens": 637448287.0, + "step": 16711 + }, + { + "epoch": 2.125938175804605, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.106365203857422, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8715540170669556, + "num_tokens": 637487899.0, + "step": 16712 + }, + { + "epoch": 2.1260653860831957, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20018196105957, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8772988319396973, + "num_tokens": 637528161.0, + "step": 16713 + }, + { + "epoch": 2.1261925963617863, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.019920349121094, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8611422777175903, + "num_tokens": 637565692.0, + "step": 16714 + }, + { + "epoch": 2.1263198066403763, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56474494934082, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8603441119194031, + "num_tokens": 637600368.0, + "step": 16715 + }, + { + "epoch": 2.126447016918967, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.219144821166992, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8700928688049316, + "num_tokens": 637635732.0, + "step": 16716 + }, + { + "epoch": 2.1265742271975574, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.069211959838867, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8695799112319946, + "num_tokens": 637677127.0, + "step": 16717 + }, + { + "epoch": 2.126701437476148, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.313499450683594, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8719491958618164, + "num_tokens": 637715278.0, + "step": 16718 + }, + { + "epoch": 2.1268286477547385, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.128204345703125, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8757661581039429, + "num_tokens": 637742937.0, + "step": 16719 + }, + { + "epoch": 2.126955858033329, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.227853775024414, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8763257265090942, + "num_tokens": 637776761.0, + "step": 16720 + }, + { + "epoch": 2.1270830683119195, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.601215362548828, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8745898604393005, + "num_tokens": 637812197.0, + "step": 16721 + }, + { + "epoch": 2.12721027859051, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32356071472168, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8767067193984985, + "num_tokens": 637850442.0, + "step": 16722 + }, + { + "epoch": 2.1273374888691006, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20184898376465, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.873717725276947, + "num_tokens": 637889352.0, + "step": 16723 + }, + { + "epoch": 2.127464699147691, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.182397842407227, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8547087907791138, + "num_tokens": 637927195.0, + "step": 16724 + }, + { + "epoch": 2.1275919094262816, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.138229370117188, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8620796203613281, + "num_tokens": 637969431.0, + "step": 16725 + }, + { + "epoch": 2.127719119704872, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.297317504882812, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8900332450866699, + "num_tokens": 638013910.0, + "step": 16726 + }, + { + "epoch": 2.1278463299834627, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.325273513793945, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8621822595596313, + "num_tokens": 638051848.0, + "step": 16727 + }, + { + "epoch": 2.127973540262053, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.364805221557617, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.873475968837738, + "num_tokens": 638092192.0, + "step": 16728 + }, + { + "epoch": 2.1281007505406437, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.216262817382812, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8817775249481201, + "num_tokens": 638134306.0, + "step": 16729 + }, + { + "epoch": 2.1282279608192343, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.183507919311523, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8587443828582764, + "num_tokens": 638177988.0, + "step": 16730 + }, + { + "epoch": 2.128355171097825, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.195192337036133, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8835744857788086, + "num_tokens": 638212237.0, + "step": 16731 + }, + { + "epoch": 2.1284823813764153, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24884033203125, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8656934499740601, + "num_tokens": 638252492.0, + "step": 16732 + }, + { + "epoch": 2.128609591655006, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.559206008911133, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8721575736999512, + "num_tokens": 638294308.0, + "step": 16733 + }, + { + "epoch": 2.1287368019335964, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.174760818481445, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8764861226081848, + "num_tokens": 638330178.0, + "step": 16734 + }, + { + "epoch": 2.128864012212187, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.191547393798828, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8524935841560364, + "num_tokens": 638372731.0, + "step": 16735 + }, + { + "epoch": 2.1289912224907774, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.358335494995117, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8688939213752747, + "num_tokens": 638407522.0, + "step": 16736 + }, + { + "epoch": 2.129118432769368, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.254283905029297, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8745589256286621, + "num_tokens": 638443262.0, + "step": 16737 + }, + { + "epoch": 2.129245643047958, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.00914192199707, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8725959658622742, + "num_tokens": 638482729.0, + "step": 16738 + }, + { + "epoch": 2.129372853326549, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.320226669311523, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8812946081161499, + "num_tokens": 638520034.0, + "step": 16739 + }, + { + "epoch": 2.129500063605139, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.13569450378418, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8790382146835327, + "num_tokens": 638558631.0, + "step": 16740 + }, + { + "epoch": 2.1296272738837296, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.185888290405273, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8837209939956665, + "num_tokens": 638593565.0, + "step": 16741 + }, + { + "epoch": 2.12975448416232, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50876808166504, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8773065805435181, + "num_tokens": 638625952.0, + "step": 16742 + }, + { + "epoch": 2.1298816944409107, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.238536834716797, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8701545000076294, + "num_tokens": 638662000.0, + "step": 16743 + }, + { + "epoch": 2.130008904719501, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23027992248535, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8590331077575684, + "num_tokens": 638706020.0, + "step": 16744 + }, + { + "epoch": 2.1301361149980917, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.318721771240234, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.87945556640625, + "num_tokens": 638745354.0, + "step": 16745 + }, + { + "epoch": 2.1302633252766823, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.099777221679688, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8664053082466125, + "num_tokens": 638785497.0, + "step": 16746 + }, + { + "epoch": 2.130390535555273, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.317541122436523, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8527452945709229, + "num_tokens": 638821650.0, + "step": 16747 + }, + { + "epoch": 2.1305177458338633, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.252422332763672, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8591995239257812, + "num_tokens": 638855589.0, + "step": 16748 + }, + { + "epoch": 2.130644956112454, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.235797882080078, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.871774435043335, + "num_tokens": 638895766.0, + "step": 16749 + }, + { + "epoch": 2.1307721663910444, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.228853225708008, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8642652034759521, + "num_tokens": 638932622.0, + "step": 16750 + }, + { + "epoch": 2.130899376669635, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.245481491088867, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8619795441627502, + "num_tokens": 638971588.0, + "step": 16751 + }, + { + "epoch": 2.1310265869482254, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10287857055664, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8775435090065002, + "num_tokens": 639009921.0, + "step": 16752 + }, + { + "epoch": 2.131153797226816, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36961555480957, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8713827133178711, + "num_tokens": 639048160.0, + "step": 16753 + }, + { + "epoch": 2.1312810075054065, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33884048461914, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8851406574249268, + "num_tokens": 639080260.0, + "step": 16754 + }, + { + "epoch": 2.131408217783997, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.026540756225586, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8646335005760193, + "num_tokens": 639118758.0, + "step": 16755 + }, + { + "epoch": 2.1315354280625876, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34716033935547, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8729771375656128, + "num_tokens": 639159528.0, + "step": 16756 + }, + { + "epoch": 2.131662638341178, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.145145416259766, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8760098218917847, + "num_tokens": 639200394.0, + "step": 16757 + }, + { + "epoch": 2.1317898486197686, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34253692626953, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8635741472244263, + "num_tokens": 639240938.0, + "step": 16758 + }, + { + "epoch": 2.131917058898359, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20200538635254, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8670520782470703, + "num_tokens": 639278097.0, + "step": 16759 + }, + { + "epoch": 2.1320442691769497, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.351329803466797, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8719184994697571, + "num_tokens": 639324512.0, + "step": 16760 + }, + { + "epoch": 2.13217147945554, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.468400955200195, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8766969442367554, + "num_tokens": 639366833.0, + "step": 16761 + }, + { + "epoch": 2.1322986897341307, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.095252990722656, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8582043647766113, + "num_tokens": 639405765.0, + "step": 16762 + }, + { + "epoch": 2.132425900012721, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.348773956298828, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8681807518005371, + "num_tokens": 639439622.0, + "step": 16763 + }, + { + "epoch": 2.1325531102913113, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.333158493041992, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8837763667106628, + "num_tokens": 639479943.0, + "step": 16764 + }, + { + "epoch": 2.132680320569902, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.255531311035156, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8668073415756226, + "num_tokens": 639520487.0, + "step": 16765 + }, + { + "epoch": 2.1328075308484924, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05224609375, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8710936307907104, + "num_tokens": 639558997.0, + "step": 16766 + }, + { + "epoch": 2.132934741127083, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.531341552734375, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8718284964561462, + "num_tokens": 639594569.0, + "step": 16767 + }, + { + "epoch": 2.1330619514056735, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.454191207885742, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8856143951416016, + "num_tokens": 639629827.0, + "step": 16768 + }, + { + "epoch": 2.133189161684264, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.045448303222656, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8664102554321289, + "num_tokens": 639667634.0, + "step": 16769 + }, + { + "epoch": 2.1333163719628545, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2794132232666, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8704220056533813, + "num_tokens": 639700536.0, + "step": 16770 + }, + { + "epoch": 2.133443582241445, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30474853515625, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8745259046554565, + "num_tokens": 639744006.0, + "step": 16771 + }, + { + "epoch": 2.1335707925200356, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.978994369506836, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8768289089202881, + "num_tokens": 639784950.0, + "step": 16772 + }, + { + "epoch": 2.133698002798626, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.229106903076172, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8853518962860107, + "num_tokens": 639823567.0, + "step": 16773 + }, + { + "epoch": 2.1338252130772166, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.491514205932617, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.882035493850708, + "num_tokens": 639863386.0, + "step": 16774 + }, + { + "epoch": 2.133952423355807, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.325275421142578, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8602622151374817, + "num_tokens": 639901219.0, + "step": 16775 + }, + { + "epoch": 2.1340796336343977, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.999231338500977, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8566142916679382, + "num_tokens": 639935240.0, + "step": 16776 + }, + { + "epoch": 2.134206843912988, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.360231399536133, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8639333248138428, + "num_tokens": 639975370.0, + "step": 16777 + }, + { + "epoch": 2.1343340541915787, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.240392684936523, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8535604476928711, + "num_tokens": 640020463.0, + "step": 16778 + }, + { + "epoch": 2.1344612644701693, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.107267379760742, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8669911026954651, + "num_tokens": 640057142.0, + "step": 16779 + }, + { + "epoch": 2.13458847474876, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41428565979004, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8686572313308716, + "num_tokens": 640098879.0, + "step": 16780 + }, + { + "epoch": 2.1347156850273503, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.217191696166992, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8741244077682495, + "num_tokens": 640136077.0, + "step": 16781 + }, + { + "epoch": 2.134842895305941, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.196157455444336, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8592169284820557, + "num_tokens": 640175160.0, + "step": 16782 + }, + { + "epoch": 2.1349701055845314, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.166873931884766, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8965831995010376, + "num_tokens": 640216670.0, + "step": 16783 + }, + { + "epoch": 2.135097315863122, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.263349533081055, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8693689107894897, + "num_tokens": 640254872.0, + "step": 16784 + }, + { + "epoch": 2.1352245261417124, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.385028839111328, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8702839612960815, + "num_tokens": 640289319.0, + "step": 16785 + }, + { + "epoch": 2.135351736420303, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36564064025879, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8867409229278564, + "num_tokens": 640323897.0, + "step": 16786 + }, + { + "epoch": 2.1354789466988935, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.040925979614258, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8780917525291443, + "num_tokens": 640366203.0, + "step": 16787 + }, + { + "epoch": 2.1356061569774836, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.19110107421875, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.881608247756958, + "num_tokens": 640408089.0, + "step": 16788 + }, + { + "epoch": 2.135733367256074, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.371068954467773, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8804105520248413, + "num_tokens": 640443532.0, + "step": 16789 + }, + { + "epoch": 2.1358605775346646, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.121370315551758, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8654403686523438, + "num_tokens": 640480425.0, + "step": 16790 + }, + { + "epoch": 2.135987787813255, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49948501586914, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8750408887863159, + "num_tokens": 640525658.0, + "step": 16791 + }, + { + "epoch": 2.1361149980918457, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29585838317871, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8813052177429199, + "num_tokens": 640563276.0, + "step": 16792 + }, + { + "epoch": 2.136242208370436, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1541748046875, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8657711148262024, + "num_tokens": 640608150.0, + "step": 16793 + }, + { + "epoch": 2.1363694186490267, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.320018768310547, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8740270137786865, + "num_tokens": 640646127.0, + "step": 16794 + }, + { + "epoch": 2.1364966289276173, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.084598541259766, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.878319263458252, + "num_tokens": 640683195.0, + "step": 16795 + }, + { + "epoch": 2.136623839206208, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.113161087036133, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8864784240722656, + "num_tokens": 640724102.0, + "step": 16796 + }, + { + "epoch": 2.1367510494847983, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23854637145996, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8687046766281128, + "num_tokens": 640762624.0, + "step": 16797 + }, + { + "epoch": 2.136878259763389, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.357040405273438, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8678622245788574, + "num_tokens": 640799437.0, + "step": 16798 + }, + { + "epoch": 2.1370054700419794, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.186683654785156, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8787871599197388, + "num_tokens": 640837115.0, + "step": 16799 + }, + { + "epoch": 2.13713268032057, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06134605407715, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8723514080047607, + "num_tokens": 640882312.0, + "step": 16800 + }, + { + "epoch": 2.1372598905991604, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.291484832763672, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8721886873245239, + "num_tokens": 640921032.0, + "step": 16801 + }, + { + "epoch": 2.137387100877751, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.12238311767578, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8733710050582886, + "num_tokens": 640957424.0, + "step": 16802 + }, + { + "epoch": 2.1375143111563415, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22992706298828, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8637447953224182, + "num_tokens": 640992985.0, + "step": 16803 + }, + { + "epoch": 2.137641521434932, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1248779296875, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.879427433013916, + "num_tokens": 641030379.0, + "step": 16804 + }, + { + "epoch": 2.1377687317135226, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.093862533569336, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8646956086158752, + "num_tokens": 641068526.0, + "step": 16805 + }, + { + "epoch": 2.137895941992113, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.310714721679688, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8662540316581726, + "num_tokens": 641103351.0, + "step": 16806 + }, + { + "epoch": 2.1380231522707036, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.05322265625, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.874222993850708, + "num_tokens": 641143570.0, + "step": 16807 + }, + { + "epoch": 2.138150362549294, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.062780380249023, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.877413809299469, + "num_tokens": 641180625.0, + "step": 16808 + }, + { + "epoch": 2.1382775728278847, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.235445022583008, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8519457578659058, + "num_tokens": 641226770.0, + "step": 16809 + }, + { + "epoch": 2.138404783106475, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.047168731689453, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8845281600952148, + "num_tokens": 641269682.0, + "step": 16810 + }, + { + "epoch": 2.1385319933850653, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.347200393676758, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8844772577285767, + "num_tokens": 641303803.0, + "step": 16811 + }, + { + "epoch": 2.1386592036636562, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.201345443725586, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8663265705108643, + "num_tokens": 641343141.0, + "step": 16812 + }, + { + "epoch": 2.1387864139422463, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.269969940185547, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8765655755996704, + "num_tokens": 641384006.0, + "step": 16813 + }, + { + "epoch": 2.138913624220837, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.227514266967773, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8510485291481018, + "num_tokens": 641423998.0, + "step": 16814 + }, + { + "epoch": 2.1390408344994274, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.275707244873047, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8716375827789307, + "num_tokens": 641457886.0, + "step": 16815 + }, + { + "epoch": 2.139168044778018, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.246910095214844, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8725026249885559, + "num_tokens": 641503386.0, + "step": 16816 + }, + { + "epoch": 2.1392952550566084, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34305191040039, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8826887011528015, + "num_tokens": 641539533.0, + "step": 16817 + }, + { + "epoch": 2.139422465335199, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.140644073486328, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8671958446502686, + "num_tokens": 641574020.0, + "step": 16818 + }, + { + "epoch": 2.1395496756137895, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.370664596557617, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8761518001556396, + "num_tokens": 641605206.0, + "step": 16819 + }, + { + "epoch": 2.13967688589238, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09389305114746, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8624374866485596, + "num_tokens": 641649857.0, + "step": 16820 + }, + { + "epoch": 2.1398040961709706, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.141918182373047, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8671807050704956, + "num_tokens": 641683360.0, + "step": 16821 + }, + { + "epoch": 2.139931306449561, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.215476989746094, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8726009130477905, + "num_tokens": 641720105.0, + "step": 16822 + }, + { + "epoch": 2.1400585167281516, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.278871536254883, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8747222423553467, + "num_tokens": 641750718.0, + "step": 16823 + }, + { + "epoch": 2.140185727006742, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37703514099121, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8711696863174438, + "num_tokens": 641791620.0, + "step": 16824 + }, + { + "epoch": 2.1403129372853327, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15540885925293, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8604953289031982, + "num_tokens": 641833566.0, + "step": 16825 + }, + { + "epoch": 2.140440147563923, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.19773292541504, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8783224821090698, + "num_tokens": 641869080.0, + "step": 16826 + }, + { + "epoch": 2.1405673578425137, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.280139923095703, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8693397045135498, + "num_tokens": 641910195.0, + "step": 16827 + }, + { + "epoch": 2.1406945681211043, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.152511596679688, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8594516515731812, + "num_tokens": 641947270.0, + "step": 16828 + }, + { + "epoch": 2.140821778399695, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.232826232910156, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.873852014541626, + "num_tokens": 641986472.0, + "step": 16829 + }, + { + "epoch": 2.1409489886782853, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.193241119384766, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8587129712104797, + "num_tokens": 642028130.0, + "step": 16830 + }, + { + "epoch": 2.141076198956876, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.129802703857422, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8666378855705261, + "num_tokens": 642067097.0, + "step": 16831 + }, + { + "epoch": 2.1412034092354664, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.218069076538086, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8827871084213257, + "num_tokens": 642099628.0, + "step": 16832 + }, + { + "epoch": 2.141330619514057, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32455062866211, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.894615650177002, + "num_tokens": 642135941.0, + "step": 16833 + }, + { + "epoch": 2.1414578297926474, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.043363571166992, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8660631775856018, + "num_tokens": 642169465.0, + "step": 16834 + }, + { + "epoch": 2.141585040071238, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.104833602905273, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.87577885389328, + "num_tokens": 642208961.0, + "step": 16835 + }, + { + "epoch": 2.141712250349828, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15558433532715, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8716221451759338, + "num_tokens": 642249691.0, + "step": 16836 + }, + { + "epoch": 2.141839460628419, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.103940963745117, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8748448491096497, + "num_tokens": 642292425.0, + "step": 16837 + }, + { + "epoch": 2.141966670907009, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.196182250976562, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8817583322525024, + "num_tokens": 642330982.0, + "step": 16838 + }, + { + "epoch": 2.1420938811855996, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.389490127563477, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8735744953155518, + "num_tokens": 642365414.0, + "step": 16839 + }, + { + "epoch": 2.14222109146419, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.241228103637695, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8669753074645996, + "num_tokens": 642398660.0, + "step": 16840 + }, + { + "epoch": 2.1423483017427807, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.042285919189453, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8788079023361206, + "num_tokens": 642432605.0, + "step": 16841 + }, + { + "epoch": 2.142475512021371, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17226219177246, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8806870579719543, + "num_tokens": 642472041.0, + "step": 16842 + }, + { + "epoch": 2.1426027222999617, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.164453506469727, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8827040195465088, + "num_tokens": 642512399.0, + "step": 16843 + }, + { + "epoch": 2.1427299325785523, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.135114669799805, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8662272691726685, + "num_tokens": 642550829.0, + "step": 16844 + }, + { + "epoch": 2.142857142857143, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.523162841796875, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8720120191574097, + "num_tokens": 642591255.0, + "step": 16845 + }, + { + "epoch": 2.1429843531357333, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.017118453979492, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8742530345916748, + "num_tokens": 642625772.0, + "step": 16846 + }, + { + "epoch": 2.143111563414324, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20758819580078, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8660010099411011, + "num_tokens": 642667596.0, + "step": 16847 + }, + { + "epoch": 2.1432387736929144, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.167545318603516, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8733375072479248, + "num_tokens": 642702159.0, + "step": 16848 + }, + { + "epoch": 2.143365983971505, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.251676559448242, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8865460157394409, + "num_tokens": 642737307.0, + "step": 16849 + }, + { + "epoch": 2.1434931942500954, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.193090438842773, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8630161285400391, + "num_tokens": 642776660.0, + "step": 16850 + }, + { + "epoch": 2.143620404528686, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33843231201172, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8668385744094849, + "num_tokens": 642814131.0, + "step": 16851 + }, + { + "epoch": 2.1437476148072765, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.203384399414062, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.853603720664978, + "num_tokens": 642854138.0, + "step": 16852 + }, + { + "epoch": 2.143874825085867, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.181943893432617, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8799049854278564, + "num_tokens": 642893767.0, + "step": 16853 + }, + { + "epoch": 2.1440020353644575, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.188335418701172, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.886725902557373, + "num_tokens": 642928286.0, + "step": 16854 + }, + { + "epoch": 2.144129245643048, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.349470138549805, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8796574473381042, + "num_tokens": 642971220.0, + "step": 16855 + }, + { + "epoch": 2.1442564559216386, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32130241394043, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.876437783241272, + "num_tokens": 643013618.0, + "step": 16856 + }, + { + "epoch": 2.144383666200229, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.08254051208496, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8750746846199036, + "num_tokens": 643058122.0, + "step": 16857 + }, + { + "epoch": 2.1445108764788197, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34974479675293, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8597811460494995, + "num_tokens": 643096662.0, + "step": 16858 + }, + { + "epoch": 2.14463808675741, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.028358459472656, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8848569393157959, + "num_tokens": 643137923.0, + "step": 16859 + }, + { + "epoch": 2.1447652970360007, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.247745513916016, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8784151673316956, + "num_tokens": 643176918.0, + "step": 16860 + }, + { + "epoch": 2.144892507314591, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30120277404785, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8902894854545593, + "num_tokens": 643217525.0, + "step": 16861 + }, + { + "epoch": 2.1450197175931813, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.250213623046875, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8894720077514648, + "num_tokens": 643255131.0, + "step": 16862 + }, + { + "epoch": 2.145146927871772, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24159049987793, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8810580968856812, + "num_tokens": 643292092.0, + "step": 16863 + }, + { + "epoch": 2.1452741381503624, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36073875427246, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8695048093795776, + "num_tokens": 643330510.0, + "step": 16864 + }, + { + "epoch": 2.145401348428953, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.159351348876953, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8684719800949097, + "num_tokens": 643372900.0, + "step": 16865 + }, + { + "epoch": 2.1455285587075434, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18074607849121, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.881009578704834, + "num_tokens": 643412866.0, + "step": 16866 + }, + { + "epoch": 2.145655768986134, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.38353729248047, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8564938902854919, + "num_tokens": 643452846.0, + "step": 16867 + }, + { + "epoch": 2.1457829792647245, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.393495559692383, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8783404231071472, + "num_tokens": 643488264.0, + "step": 16868 + }, + { + "epoch": 2.145910189543315, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43146324157715, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8610765933990479, + "num_tokens": 643528131.0, + "step": 16869 + }, + { + "epoch": 2.1460373998219056, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2733211517334, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8736330270767212, + "num_tokens": 643564041.0, + "step": 16870 + }, + { + "epoch": 2.146164610100496, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.382131576538086, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.860655665397644, + "num_tokens": 643601507.0, + "step": 16871 + }, + { + "epoch": 2.1462918203790866, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.483911514282227, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8571673631668091, + "num_tokens": 643647161.0, + "step": 16872 + }, + { + "epoch": 2.146419030657677, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.212385177612305, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8776189684867859, + "num_tokens": 643687114.0, + "step": 16873 + }, + { + "epoch": 2.1465462409362677, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30339813232422, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8581925630569458, + "num_tokens": 643722110.0, + "step": 16874 + }, + { + "epoch": 2.146673451214858, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.055700302124023, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8503930568695068, + "num_tokens": 643765770.0, + "step": 16875 + }, + { + "epoch": 2.1468006614934487, + "ewc_loss": 0.0361328125, + "ewc_loss_parallel": 3.62396240234375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.135313034057617, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.865020751953125, + "num_tokens": 643810097.0, + "step": 16876 + }, + { + "epoch": 2.1469278717720393, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.105213165283203, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.847011923789978, + "num_tokens": 643847904.0, + "step": 16877 + }, + { + "epoch": 2.14705508205063, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26600456237793, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.881048858165741, + "num_tokens": 643889157.0, + "step": 16878 + }, + { + "epoch": 2.1471822923292203, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24355697631836, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8752647638320923, + "num_tokens": 643931181.0, + "step": 16879 + }, + { + "epoch": 2.147309502607811, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.02440071105957, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.870039701461792, + "num_tokens": 643965415.0, + "step": 16880 + }, + { + "epoch": 2.1474367128864014, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30115509033203, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.857096791267395, + "num_tokens": 644010647.0, + "step": 16881 + }, + { + "epoch": 2.147563923164992, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.155109405517578, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8883289098739624, + "num_tokens": 644044347.0, + "step": 16882 + }, + { + "epoch": 2.1476911334435824, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.093021392822266, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8800781965255737, + "num_tokens": 644083456.0, + "step": 16883 + }, + { + "epoch": 2.147818343722173, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.231103897094727, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8636047840118408, + "num_tokens": 644123851.0, + "step": 16884 + }, + { + "epoch": 2.1479455540007635, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37171745300293, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8821161389350891, + "num_tokens": 644168880.0, + "step": 16885 + }, + { + "epoch": 2.1480727642793536, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.007986068725586, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.874643862247467, + "num_tokens": 644204532.0, + "step": 16886 + }, + { + "epoch": 2.148199974557944, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.221975326538086, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8903388977050781, + "num_tokens": 644246160.0, + "step": 16887 + }, + { + "epoch": 2.1483271848365346, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35064697265625, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8852661848068237, + "num_tokens": 644284881.0, + "step": 16888 + }, + { + "epoch": 2.148454395115125, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26433753967285, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8661136627197266, + "num_tokens": 644326661.0, + "step": 16889 + }, + { + "epoch": 2.1485816053937157, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.030118942260742, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8765102624893188, + "num_tokens": 644364420.0, + "step": 16890 + }, + { + "epoch": 2.148708815672306, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.657018661499023, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8554094433784485, + "num_tokens": 644398626.0, + "step": 16891 + }, + { + "epoch": 2.1488360259508967, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41749382019043, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8728107213973999, + "num_tokens": 644434772.0, + "step": 16892 + }, + { + "epoch": 2.1489632362294873, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.210742950439453, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8707125782966614, + "num_tokens": 644474425.0, + "step": 16893 + }, + { + "epoch": 2.149090446508078, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.28668785095215, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8651723265647888, + "num_tokens": 644507796.0, + "step": 16894 + }, + { + "epoch": 2.1492176567866683, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31833267211914, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8695710897445679, + "num_tokens": 644544172.0, + "step": 16895 + }, + { + "epoch": 2.149344867065259, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.131845474243164, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8716287612915039, + "num_tokens": 644582217.0, + "step": 16896 + }, + { + "epoch": 2.1494720773438494, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.19127082824707, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8782763481140137, + "num_tokens": 644620028.0, + "step": 16897 + }, + { + "epoch": 2.14959928762244, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.379518508911133, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8567785024642944, + "num_tokens": 644663743.0, + "step": 16898 + }, + { + "epoch": 2.1497264979010304, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.491947174072266, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.87445467710495, + "num_tokens": 644700914.0, + "step": 16899 + }, + { + "epoch": 2.149853708179621, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 24.89237403869629, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8696582317352295, + "num_tokens": 644744232.0, + "step": 16900 + }, + { + "epoch": 2.1499809184582115, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.227252960205078, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8661733865737915, + "num_tokens": 644780851.0, + "step": 16901 + }, + { + "epoch": 2.150108128736802, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.173860549926758, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8810313940048218, + "num_tokens": 644822103.0, + "step": 16902 + }, + { + "epoch": 2.1502353390153925, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.330644607543945, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8697974681854248, + "num_tokens": 644861209.0, + "step": 16903 + }, + { + "epoch": 2.150362549293983, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.230030059814453, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8593171238899231, + "num_tokens": 644899860.0, + "step": 16904 + }, + { + "epoch": 2.1504897595725736, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.083852767944336, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8680697679519653, + "num_tokens": 644943101.0, + "step": 16905 + }, + { + "epoch": 2.150616969851164, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.459983825683594, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8614631295204163, + "num_tokens": 644981786.0, + "step": 16906 + }, + { + "epoch": 2.1507441801297547, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.347135543823242, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.874590277671814, + "num_tokens": 645017949.0, + "step": 16907 + }, + { + "epoch": 2.150871390408345, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.25343894958496, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8706893920898438, + "num_tokens": 645053443.0, + "step": 16908 + }, + { + "epoch": 2.1509986006869353, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.307043075561523, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8748387098312378, + "num_tokens": 645089916.0, + "step": 16909 + }, + { + "epoch": 2.1511258109655262, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.279727935791016, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.856670081615448, + "num_tokens": 645134531.0, + "step": 16910 + }, + { + "epoch": 2.1512530212441163, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.213491439819336, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8678538203239441, + "num_tokens": 645172608.0, + "step": 16911 + }, + { + "epoch": 2.151380231522707, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.173063278198242, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8683489561080933, + "num_tokens": 645209895.0, + "step": 16912 + }, + { + "epoch": 2.1515074418012974, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.622480392456055, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8782466650009155, + "num_tokens": 645245259.0, + "step": 16913 + }, + { + "epoch": 2.151634652079888, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.280126571655273, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8669161796569824, + "num_tokens": 645284962.0, + "step": 16914 + }, + { + "epoch": 2.1517618623584784, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.06968116760254, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8866164684295654, + "num_tokens": 645322352.0, + "step": 16915 + }, + { + "epoch": 2.151889072637069, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56499671936035, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8716750144958496, + "num_tokens": 645358401.0, + "step": 16916 + }, + { + "epoch": 2.1520162829156595, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.130050659179688, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8846246004104614, + "num_tokens": 645399147.0, + "step": 16917 + }, + { + "epoch": 2.15214349319425, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41506004333496, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8538129925727844, + "num_tokens": 645437286.0, + "step": 16918 + }, + { + "epoch": 2.1522707034728406, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.266782760620117, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8650637269020081, + "num_tokens": 645477853.0, + "step": 16919 + }, + { + "epoch": 2.152397913751431, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.148813247680664, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8622539639472961, + "num_tokens": 645520476.0, + "step": 16920 + }, + { + "epoch": 2.1525251240300216, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.311330795288086, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8580464124679565, + "num_tokens": 645561539.0, + "step": 16921 + }, + { + "epoch": 2.152652334308612, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30611228942871, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8674920797348022, + "num_tokens": 645602950.0, + "step": 16922 + }, + { + "epoch": 2.1527795445872027, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.572559356689453, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.861632764339447, + "num_tokens": 645639215.0, + "step": 16923 + }, + { + "epoch": 2.152906754865793, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.364648818969727, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8712283372879028, + "num_tokens": 645677287.0, + "step": 16924 + }, + { + "epoch": 2.1530339651443837, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.195283889770508, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8795541524887085, + "num_tokens": 645710784.0, + "step": 16925 + }, + { + "epoch": 2.1531611754229742, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44748306274414, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8669173717498779, + "num_tokens": 645750568.0, + "step": 16926 + }, + { + "epoch": 2.1532883857015648, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42289924621582, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8772672414779663, + "num_tokens": 645783911.0, + "step": 16927 + }, + { + "epoch": 2.1534155959801553, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32130241394043, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8882662057876587, + "num_tokens": 645822524.0, + "step": 16928 + }, + { + "epoch": 2.153542806258746, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33653450012207, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8674479722976685, + "num_tokens": 645856924.0, + "step": 16929 + }, + { + "epoch": 2.1536700165373364, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24897575378418, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8649599552154541, + "num_tokens": 645896995.0, + "step": 16930 + }, + { + "epoch": 2.153797226815927, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.457216262817383, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8786790370941162, + "num_tokens": 645941768.0, + "step": 16931 + }, + { + "epoch": 2.1539244370945174, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.433666229248047, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8703071475028992, + "num_tokens": 645981909.0, + "step": 16932 + }, + { + "epoch": 2.154051647373108, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.263456344604492, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8631024360656738, + "num_tokens": 646022034.0, + "step": 16933 + }, + { + "epoch": 2.154178857651698, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.312793731689453, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8702081441879272, + "num_tokens": 646056714.0, + "step": 16934 + }, + { + "epoch": 2.1543060679302886, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.365161895751953, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.864016056060791, + "num_tokens": 646095668.0, + "step": 16935 + }, + { + "epoch": 2.154433278208879, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.237667083740234, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8559693694114685, + "num_tokens": 646136745.0, + "step": 16936 + }, + { + "epoch": 2.1545604884874696, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.367015838623047, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8691047430038452, + "num_tokens": 646180314.0, + "step": 16937 + }, + { + "epoch": 2.15468769876606, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43621063232422, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8736791014671326, + "num_tokens": 646218191.0, + "step": 16938 + }, + { + "epoch": 2.1548149090446507, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.292383193969727, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8864657878875732, + "num_tokens": 646256350.0, + "step": 16939 + }, + { + "epoch": 2.154942119323241, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22737693786621, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8939181566238403, + "num_tokens": 646290915.0, + "step": 16940 + }, + { + "epoch": 2.1550693296018317, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.296972274780273, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8646327257156372, + "num_tokens": 646325838.0, + "step": 16941 + }, + { + "epoch": 2.1551965398804223, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26608657836914, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.853191614151001, + "num_tokens": 646362742.0, + "step": 16942 + }, + { + "epoch": 2.155323750159013, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.28681755065918, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8578990697860718, + "num_tokens": 646403958.0, + "step": 16943 + }, + { + "epoch": 2.1554509604376033, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.245861053466797, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.865982711315155, + "num_tokens": 646442872.0, + "step": 16944 + }, + { + "epoch": 2.155578170716194, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32316780090332, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8783156871795654, + "num_tokens": 646484642.0, + "step": 16945 + }, + { + "epoch": 2.1557053809947844, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.222043991088867, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8808010816574097, + "num_tokens": 646521688.0, + "step": 16946 + }, + { + "epoch": 2.155832591273375, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.267208099365234, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8795192837715149, + "num_tokens": 646560182.0, + "step": 16947 + }, + { + "epoch": 2.1559598015519654, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.148578643798828, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8751955032348633, + "num_tokens": 646593252.0, + "step": 16948 + }, + { + "epoch": 2.156087011830556, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.3890438079834, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8702805042266846, + "num_tokens": 646635514.0, + "step": 16949 + }, + { + "epoch": 2.1562142221091465, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.022167205810547, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8584581613540649, + "num_tokens": 646674785.0, + "step": 16950 + }, + { + "epoch": 2.156341432387737, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.341646194458008, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8705907464027405, + "num_tokens": 646713832.0, + "step": 16951 + }, + { + "epoch": 2.1564686426663275, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.302146911621094, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.875532865524292, + "num_tokens": 646752218.0, + "step": 16952 + }, + { + "epoch": 2.156595852944918, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.270408630371094, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8703786730766296, + "num_tokens": 646786196.0, + "step": 16953 + }, + { + "epoch": 2.1567230632235086, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.361928939819336, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8651553988456726, + "num_tokens": 646821462.0, + "step": 16954 + }, + { + "epoch": 2.156850273502099, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.283700942993164, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8782964944839478, + "num_tokens": 646856613.0, + "step": 16955 + }, + { + "epoch": 2.1569774837806897, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.457881927490234, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.882513165473938, + "num_tokens": 646893013.0, + "step": 16956 + }, + { + "epoch": 2.15710469405928, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37779426574707, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8763265609741211, + "num_tokens": 646936180.0, + "step": 16957 + }, + { + "epoch": 2.1572319043378707, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.3630313873291, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.871726393699646, + "num_tokens": 646979314.0, + "step": 16958 + }, + { + "epoch": 2.157359114616461, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.197059631347656, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.881354808807373, + "num_tokens": 647020136.0, + "step": 16959 + }, + { + "epoch": 2.1574863248950513, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.320419311523438, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8762215375900269, + "num_tokens": 647052970.0, + "step": 16960 + }, + { + "epoch": 2.157613535173642, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.117292404174805, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8662220239639282, + "num_tokens": 647093195.0, + "step": 16961 + }, + { + "epoch": 2.1577407454522324, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.397581100463867, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8724069595336914, + "num_tokens": 647133800.0, + "step": 16962 + }, + { + "epoch": 2.157867955730823, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.276758193969727, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.855547308921814, + "num_tokens": 647167704.0, + "step": 16963 + }, + { + "epoch": 2.1579951660094134, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.393489837646484, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8595099449157715, + "num_tokens": 647208474.0, + "step": 16964 + }, + { + "epoch": 2.158122376288004, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.245229721069336, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8846670389175415, + "num_tokens": 647242131.0, + "step": 16965 + }, + { + "epoch": 2.1582495865665945, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.105016708374023, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8526358604431152, + "num_tokens": 647280952.0, + "step": 16966 + }, + { + "epoch": 2.158376796845185, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.298139572143555, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8668496012687683, + "num_tokens": 647318411.0, + "step": 16967 + }, + { + "epoch": 2.1585040071237755, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.325349807739258, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8579082489013672, + "num_tokens": 647361002.0, + "step": 16968 + }, + { + "epoch": 2.158631217402366, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.082386016845703, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8751861453056335, + "num_tokens": 647402995.0, + "step": 16969 + }, + { + "epoch": 2.1587584276809566, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.12343978881836, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8738991022109985, + "num_tokens": 647441220.0, + "step": 16970 + }, + { + "epoch": 2.158885637959547, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.270862579345703, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8642286062240601, + "num_tokens": 647476832.0, + "step": 16971 + }, + { + "epoch": 2.1590128482381377, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35772705078125, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8738834857940674, + "num_tokens": 647516855.0, + "step": 16972 + }, + { + "epoch": 2.159140058516728, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.425405502319336, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8682774305343628, + "num_tokens": 647556913.0, + "step": 16973 + }, + { + "epoch": 2.1592672687953187, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.294435501098633, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8881793022155762, + "num_tokens": 647591242.0, + "step": 16974 + }, + { + "epoch": 2.1593944790739092, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.166555404663086, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8753414154052734, + "num_tokens": 647632156.0, + "step": 16975 + }, + { + "epoch": 2.1595216893524998, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33818817138672, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8780252933502197, + "num_tokens": 647669857.0, + "step": 16976 + }, + { + "epoch": 2.1596488996310903, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.185409545898438, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8597061634063721, + "num_tokens": 647715580.0, + "step": 16977 + }, + { + "epoch": 2.159776109909681, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.10509490966797, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8632541298866272, + "num_tokens": 647755917.0, + "step": 16978 + }, + { + "epoch": 2.1599033201882714, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.391372680664062, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8871528506278992, + "num_tokens": 647802205.0, + "step": 16979 + }, + { + "epoch": 2.160030530466862, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41424560546875, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8579323887825012, + "num_tokens": 647846627.0, + "step": 16980 + }, + { + "epoch": 2.1601577407454524, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.18306541442871, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8631290197372437, + "num_tokens": 647885033.0, + "step": 16981 + }, + { + "epoch": 2.160284951024043, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.297395706176758, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8614181280136108, + "num_tokens": 647923370.0, + "step": 16982 + }, + { + "epoch": 2.1604121613026335, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.459856033325195, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8777555227279663, + "num_tokens": 647963431.0, + "step": 16983 + }, + { + "epoch": 2.1605393715812236, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.11276626586914, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8644523620605469, + "num_tokens": 648002854.0, + "step": 16984 + }, + { + "epoch": 2.160666581859814, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.223485946655273, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8600445985794067, + "num_tokens": 648041082.0, + "step": 16985 + }, + { + "epoch": 2.1607937921384046, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30381202697754, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8635121583938599, + "num_tokens": 648082109.0, + "step": 16986 + }, + { + "epoch": 2.160921002416995, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.313644409179688, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8859559893608093, + "num_tokens": 648117398.0, + "step": 16987 + }, + { + "epoch": 2.1610482126955857, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22381591796875, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8794174790382385, + "num_tokens": 648158997.0, + "step": 16988 + }, + { + "epoch": 2.161175422974176, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.246681213378906, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8688029646873474, + "num_tokens": 648194334.0, + "step": 16989 + }, + { + "epoch": 2.1613026332527667, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.406394958496094, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8679693937301636, + "num_tokens": 648230601.0, + "step": 16990 + }, + { + "epoch": 2.1614298435313573, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.285587310791016, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8654261231422424, + "num_tokens": 648274979.0, + "step": 16991 + }, + { + "epoch": 2.161557053809948, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.323148727416992, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8554459810256958, + "num_tokens": 648319229.0, + "step": 16992 + }, + { + "epoch": 2.1616842640885383, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.258377075195312, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8838014602661133, + "num_tokens": 648354550.0, + "step": 16993 + }, + { + "epoch": 2.161811474367129, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50127410888672, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8837746381759644, + "num_tokens": 648389820.0, + "step": 16994 + }, + { + "epoch": 2.1619386846457194, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2392578125, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8855355978012085, + "num_tokens": 648423841.0, + "step": 16995 + }, + { + "epoch": 2.16206589492431, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.264535903930664, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8589106798171997, + "num_tokens": 648459865.0, + "step": 16996 + }, + { + "epoch": 2.1621931052029004, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.362579345703125, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8889627456665039, + "num_tokens": 648492639.0, + "step": 16997 + }, + { + "epoch": 2.162320315481491, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41535186767578, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.875912070274353, + "num_tokens": 648533456.0, + "step": 16998 + }, + { + "epoch": 2.1624475257600815, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.288345336914062, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8851835131645203, + "num_tokens": 648571955.0, + "step": 16999 + }, + { + "epoch": 2.162574736038672, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32052993774414, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.88847416639328, + "num_tokens": 648614284.0, + "step": 17000 + }, + { + "epoch": 2.1627019463172625, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.364585876464844, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.880150556564331, + "num_tokens": 648654399.0, + "step": 17001 + }, + { + "epoch": 2.162829156595853, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.191516876220703, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8743208646774292, + "num_tokens": 648694490.0, + "step": 17002 + }, + { + "epoch": 2.1629563668744436, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.375551223754883, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8774033784866333, + "num_tokens": 648730665.0, + "step": 17003 + }, + { + "epoch": 2.163083577153034, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23470115661621, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.876312255859375, + "num_tokens": 648765289.0, + "step": 17004 + }, + { + "epoch": 2.1632107874316246, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24261474609375, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8722066283226013, + "num_tokens": 648803982.0, + "step": 17005 + }, + { + "epoch": 2.163337997710215, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.286922454833984, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8775122761726379, + "num_tokens": 648840992.0, + "step": 17006 + }, + { + "epoch": 2.1634652079888053, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.268102645874023, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8675954937934875, + "num_tokens": 648883621.0, + "step": 17007 + }, + { + "epoch": 2.1635924182673962, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58730125427246, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.857868492603302, + "num_tokens": 648922258.0, + "step": 17008 + }, + { + "epoch": 2.1637196285459863, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.333290100097656, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8713064193725586, + "num_tokens": 648965010.0, + "step": 17009 + }, + { + "epoch": 2.163846838824577, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34345817565918, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8666970729827881, + "num_tokens": 649008124.0, + "step": 17010 + }, + { + "epoch": 2.1639740491031674, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.544448852539062, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8707370758056641, + "num_tokens": 649044743.0, + "step": 17011 + }, + { + "epoch": 2.164101259381758, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.388959884643555, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8844906687736511, + "num_tokens": 649078410.0, + "step": 17012 + }, + { + "epoch": 2.1642284696603484, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.25335693359375, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8825750350952148, + "num_tokens": 649115415.0, + "step": 17013 + }, + { + "epoch": 2.164355679938939, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.456613540649414, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8700233697891235, + "num_tokens": 649156266.0, + "step": 17014 + }, + { + "epoch": 2.1644828902175295, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.364425659179688, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8780478835105896, + "num_tokens": 649192552.0, + "step": 17015 + }, + { + "epoch": 2.16461010049612, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.362621307373047, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8740571737289429, + "num_tokens": 649224163.0, + "step": 17016 + }, + { + "epoch": 2.1647373107747105, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.314271926879883, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8910331726074219, + "num_tokens": 649259348.0, + "step": 17017 + }, + { + "epoch": 2.164864521053301, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44998550415039, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8877207040786743, + "num_tokens": 649296875.0, + "step": 17018 + }, + { + "epoch": 2.1649917313318916, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.337968826293945, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.860413134098053, + "num_tokens": 649333455.0, + "step": 17019 + }, + { + "epoch": 2.165118941610482, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.563072204589844, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8725723028182983, + "num_tokens": 649364567.0, + "step": 17020 + }, + { + "epoch": 2.1652461518890727, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.114471435546875, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8551235198974609, + "num_tokens": 649400579.0, + "step": 17021 + }, + { + "epoch": 2.165373362167663, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.476430892944336, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8839185237884521, + "num_tokens": 649432940.0, + "step": 17022 + }, + { + "epoch": 2.1655005724462537, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.275182723999023, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8764225840568542, + "num_tokens": 649466695.0, + "step": 17023 + }, + { + "epoch": 2.1656277827248442, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.411643981933594, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8696079254150391, + "num_tokens": 649504578.0, + "step": 17024 + }, + { + "epoch": 2.1657549930034348, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31199836730957, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8619468212127686, + "num_tokens": 649541536.0, + "step": 17025 + }, + { + "epoch": 2.1658822032820253, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45547103881836, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8698527812957764, + "num_tokens": 649581280.0, + "step": 17026 + }, + { + "epoch": 2.166009413560616, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.235200881958008, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8936139941215515, + "num_tokens": 649619837.0, + "step": 17027 + }, + { + "epoch": 2.1661366238392064, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45421028137207, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.88392573595047, + "num_tokens": 649654708.0, + "step": 17028 + }, + { + "epoch": 2.166263834117797, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.539491653442383, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8527271747589111, + "num_tokens": 649691709.0, + "step": 17029 + }, + { + "epoch": 2.1663910443963874, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.175479888916016, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8686890602111816, + "num_tokens": 649734876.0, + "step": 17030 + }, + { + "epoch": 2.166518254674978, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.280689239501953, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8805426359176636, + "num_tokens": 649775477.0, + "step": 17031 + }, + { + "epoch": 2.166645464953568, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.238697052001953, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8836108446121216, + "num_tokens": 649816136.0, + "step": 17032 + }, + { + "epoch": 2.1667726752321586, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.375600814819336, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8811591863632202, + "num_tokens": 649847942.0, + "step": 17033 + }, + { + "epoch": 2.166899885510749, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32844352722168, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8690977096557617, + "num_tokens": 649887875.0, + "step": 17034 + }, + { + "epoch": 2.1670270957893396, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34096908569336, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8592296242713928, + "num_tokens": 649920557.0, + "step": 17035 + }, + { + "epoch": 2.16715430606793, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.361909866333008, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.88097083568573, + "num_tokens": 649959704.0, + "step": 17036 + }, + { + "epoch": 2.1672815163465207, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14892578125, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8770443201065063, + "num_tokens": 650000433.0, + "step": 17037 + }, + { + "epoch": 2.167408726625111, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55740737915039, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.884186863899231, + "num_tokens": 650038793.0, + "step": 17038 + }, + { + "epoch": 2.1675359369037017, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.294071197509766, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8815442323684692, + "num_tokens": 650072877.0, + "step": 17039 + }, + { + "epoch": 2.1676631471822922, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42827796936035, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8603508472442627, + "num_tokens": 650109245.0, + "step": 17040 + }, + { + "epoch": 2.1677903574608828, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.433420181274414, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8855066299438477, + "num_tokens": 650150254.0, + "step": 17041 + }, + { + "epoch": 2.1679175677394733, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.252925872802734, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8536819219589233, + "num_tokens": 650189587.0, + "step": 17042 + }, + { + "epoch": 2.168044778018064, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.28628921508789, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8680884838104248, + "num_tokens": 650225443.0, + "step": 17043 + }, + { + "epoch": 2.1681719882966544, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.287473678588867, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8687188625335693, + "num_tokens": 650262319.0, + "step": 17044 + }, + { + "epoch": 2.168299198575245, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2297420501709, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8564378023147583, + "num_tokens": 650297137.0, + "step": 17045 + }, + { + "epoch": 2.1684264088538354, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.277103424072266, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8710134029388428, + "num_tokens": 650335521.0, + "step": 17046 + }, + { + "epoch": 2.168553619132426, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.190710067749023, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8676444292068481, + "num_tokens": 650373050.0, + "step": 17047 + }, + { + "epoch": 2.1686808294110165, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21117401123047, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8793116211891174, + "num_tokens": 650407438.0, + "step": 17048 + }, + { + "epoch": 2.168808039689607, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29118537902832, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8678438663482666, + "num_tokens": 650447793.0, + "step": 17049 + }, + { + "epoch": 2.1689352499681975, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.200937271118164, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8687878251075745, + "num_tokens": 650487167.0, + "step": 17050 + }, + { + "epoch": 2.169062460246788, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32859992980957, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8597790598869324, + "num_tokens": 650522960.0, + "step": 17051 + }, + { + "epoch": 2.1691896705253786, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.331512451171875, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8665379285812378, + "num_tokens": 650564161.0, + "step": 17052 + }, + { + "epoch": 2.169316880803969, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.300073623657227, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.879030704498291, + "num_tokens": 650599201.0, + "step": 17053 + }, + { + "epoch": 2.1694440910825596, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.361461639404297, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8538922071456909, + "num_tokens": 650637385.0, + "step": 17054 + }, + { + "epoch": 2.16957130136115, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.480165481567383, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8715751767158508, + "num_tokens": 650671954.0, + "step": 17055 + }, + { + "epoch": 2.1696985116397407, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.220027923583984, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8892418742179871, + "num_tokens": 650708475.0, + "step": 17056 + }, + { + "epoch": 2.169825721918331, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.371906280517578, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8765619993209839, + "num_tokens": 650748089.0, + "step": 17057 + }, + { + "epoch": 2.1699529321969213, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.349952697753906, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8747572302818298, + "num_tokens": 650790869.0, + "step": 17058 + }, + { + "epoch": 2.170080142475512, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.301040649414062, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8598684072494507, + "num_tokens": 650830697.0, + "step": 17059 + }, + { + "epoch": 2.1702073527541024, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35189437866211, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8810129165649414, + "num_tokens": 650868755.0, + "step": 17060 + }, + { + "epoch": 2.170334563032693, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.1669921875, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8829532861709595, + "num_tokens": 650908019.0, + "step": 17061 + }, + { + "epoch": 2.1704617733112834, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.056438446044922, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8707306385040283, + "num_tokens": 650954195.0, + "step": 17062 + }, + { + "epoch": 2.170588983589874, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44530487060547, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8845293521881104, + "num_tokens": 650992740.0, + "step": 17063 + }, + { + "epoch": 2.1707161938684645, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.217496871948242, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8852817416191101, + "num_tokens": 651031524.0, + "step": 17064 + }, + { + "epoch": 2.170843404147055, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.182493209838867, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8706082701683044, + "num_tokens": 651070004.0, + "step": 17065 + }, + { + "epoch": 2.1709706144256455, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.371871948242188, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8591793775558472, + "num_tokens": 651104474.0, + "step": 17066 + }, + { + "epoch": 2.171097824704236, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.358793258666992, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8655951023101807, + "num_tokens": 651148913.0, + "step": 17067 + }, + { + "epoch": 2.1712250349828266, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.391061782836914, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.879879355430603, + "num_tokens": 651192072.0, + "step": 17068 + }, + { + "epoch": 2.171352245261417, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.101247787475586, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8643332123756409, + "num_tokens": 651226132.0, + "step": 17069 + }, + { + "epoch": 2.1714794555400077, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.219263076782227, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8747748136520386, + "num_tokens": 651263467.0, + "step": 17070 + }, + { + "epoch": 2.171606665818598, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.288496017456055, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8643528819084167, + "num_tokens": 651309521.0, + "step": 17071 + }, + { + "epoch": 2.1717338760971887, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.566770553588867, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8746207356452942, + "num_tokens": 651347026.0, + "step": 17072 + }, + { + "epoch": 2.1718610863757792, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.316362380981445, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8805853128433228, + "num_tokens": 651380545.0, + "step": 17073 + }, + { + "epoch": 2.1719882966543698, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24380111694336, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8778722882270813, + "num_tokens": 651409428.0, + "step": 17074 + }, + { + "epoch": 2.1721155069329603, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37196159362793, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8593063950538635, + "num_tokens": 651447993.0, + "step": 17075 + }, + { + "epoch": 2.172242717211551, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24839210510254, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8674073219299316, + "num_tokens": 651488210.0, + "step": 17076 + }, + { + "epoch": 2.1723699274901414, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.527206420898438, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8679582476615906, + "num_tokens": 651525742.0, + "step": 17077 + }, + { + "epoch": 2.172497137768732, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.368440628051758, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8792631030082703, + "num_tokens": 651565569.0, + "step": 17078 + }, + { + "epoch": 2.1726243480473224, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.186798095703125, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8676903247833252, + "num_tokens": 651605291.0, + "step": 17079 + }, + { + "epoch": 2.172751558325913, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.281055450439453, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8724051117897034, + "num_tokens": 651641998.0, + "step": 17080 + }, + { + "epoch": 2.1728787686045035, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31292152404785, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8701337575912476, + "num_tokens": 651683040.0, + "step": 17081 + }, + { + "epoch": 2.1730059788830935, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.129276275634766, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8811885714530945, + "num_tokens": 651718067.0, + "step": 17082 + }, + { + "epoch": 2.173133189161684, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.604381561279297, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8866145610809326, + "num_tokens": 651761995.0, + "step": 17083 + }, + { + "epoch": 2.1732603994402746, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.176128387451172, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8866924047470093, + "num_tokens": 651797780.0, + "step": 17084 + }, + { + "epoch": 2.173387609718865, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.531709671020508, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8620141744613647, + "num_tokens": 651832931.0, + "step": 17085 + }, + { + "epoch": 2.1735148199974557, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50298500061035, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8668546676635742, + "num_tokens": 651868440.0, + "step": 17086 + }, + { + "epoch": 2.173642030276046, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.27484703063965, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8754860162734985, + "num_tokens": 651900029.0, + "step": 17087 + }, + { + "epoch": 2.1737692405546367, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.375524520874023, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8695695400238037, + "num_tokens": 651944892.0, + "step": 17088 + }, + { + "epoch": 2.1738964508332272, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.295320510864258, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8801767230033875, + "num_tokens": 651979330.0, + "step": 17089 + }, + { + "epoch": 2.1740236611118178, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36130714416504, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8644613027572632, + "num_tokens": 652017389.0, + "step": 17090 + }, + { + "epoch": 2.1741508713904083, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.38450813293457, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8684996366500854, + "num_tokens": 652052260.0, + "step": 17091 + }, + { + "epoch": 2.174278081668999, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30134391784668, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8585737347602844, + "num_tokens": 652089917.0, + "step": 17092 + }, + { + "epoch": 2.1744052919475894, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.260486602783203, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8754279613494873, + "num_tokens": 652129849.0, + "step": 17093 + }, + { + "epoch": 2.17453250222618, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20181655883789, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8647002577781677, + "num_tokens": 652163178.0, + "step": 17094 + }, + { + "epoch": 2.1746597125047704, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.407608032226562, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8781934380531311, + "num_tokens": 652201521.0, + "step": 17095 + }, + { + "epoch": 2.174786922783361, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.780752182006836, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8727065324783325, + "num_tokens": 652234820.0, + "step": 17096 + }, + { + "epoch": 2.1749141330619515, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.27391815185547, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8690409064292908, + "num_tokens": 652275686.0, + "step": 17097 + }, + { + "epoch": 2.175041343340542, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.461475372314453, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8638542294502258, + "num_tokens": 652313243.0, + "step": 17098 + }, + { + "epoch": 2.1751685536191325, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24921989440918, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8873777389526367, + "num_tokens": 652351550.0, + "step": 17099 + }, + { + "epoch": 2.175295763897723, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.228614807128906, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8695617318153381, + "num_tokens": 652399834.0, + "step": 17100 + }, + { + "epoch": 2.1754229741763136, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.51274299621582, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8590565919876099, + "num_tokens": 652435839.0, + "step": 17101 + }, + { + "epoch": 2.175550184454904, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.246213912963867, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.868586003780365, + "num_tokens": 652475747.0, + "step": 17102 + }, + { + "epoch": 2.1756773947334946, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32221031188965, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8846604228019714, + "num_tokens": 652517147.0, + "step": 17103 + }, + { + "epoch": 2.175804605012085, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.328750610351562, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8840352296829224, + "num_tokens": 652552811.0, + "step": 17104 + }, + { + "epoch": 2.1759318152906753, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.576438903808594, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8798858523368835, + "num_tokens": 652587709.0, + "step": 17105 + }, + { + "epoch": 2.1760590255692662, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.344812393188477, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.872125506401062, + "num_tokens": 652626213.0, + "step": 17106 + }, + { + "epoch": 2.1761862358478563, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34984016418457, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8437358140945435, + "num_tokens": 652665692.0, + "step": 17107 + }, + { + "epoch": 2.176313446126447, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.474987030029297, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8646491765975952, + "num_tokens": 652705515.0, + "step": 17108 + }, + { + "epoch": 2.1764406564050374, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.450443267822266, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8835351467132568, + "num_tokens": 652743343.0, + "step": 17109 + }, + { + "epoch": 2.176567866683628, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.274850845336914, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8664512634277344, + "num_tokens": 652784587.0, + "step": 17110 + }, + { + "epoch": 2.1766950769622184, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.458858489990234, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8638832569122314, + "num_tokens": 652821985.0, + "step": 17111 + }, + { + "epoch": 2.176822287240809, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34626007080078, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8824441432952881, + "num_tokens": 652855772.0, + "step": 17112 + }, + { + "epoch": 2.1769494975193995, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.272029876708984, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8672913908958435, + "num_tokens": 652890497.0, + "step": 17113 + }, + { + "epoch": 2.17707670779799, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.21674919128418, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8828141689300537, + "num_tokens": 652931611.0, + "step": 17114 + }, + { + "epoch": 2.1772039180765805, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2804012298584, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.873106062412262, + "num_tokens": 652972129.0, + "step": 17115 + }, + { + "epoch": 2.177331128355171, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.236160278320312, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8661138415336609, + "num_tokens": 653009848.0, + "step": 17116 + }, + { + "epoch": 2.1774583386337616, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17078399658203, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8696978092193604, + "num_tokens": 653044614.0, + "step": 17117 + }, + { + "epoch": 2.177585548912352, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.344907760620117, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8854240775108337, + "num_tokens": 653087614.0, + "step": 17118 + }, + { + "epoch": 2.1777127591909426, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22039222717285, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8853335380554199, + "num_tokens": 653120434.0, + "step": 17119 + }, + { + "epoch": 2.177839969469533, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22275161743164, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8827663660049438, + "num_tokens": 653159390.0, + "step": 17120 + }, + { + "epoch": 2.1779671797481237, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.136701583862305, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8641477823257446, + "num_tokens": 653201045.0, + "step": 17121 + }, + { + "epoch": 2.1780943900267142, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33681297302246, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8748059868812561, + "num_tokens": 653244968.0, + "step": 17122 + }, + { + "epoch": 2.1782216003053048, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.366744995117188, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8643937110900879, + "num_tokens": 653281157.0, + "step": 17123 + }, + { + "epoch": 2.1783488105838953, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.100011825561523, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8863149285316467, + "num_tokens": 653321055.0, + "step": 17124 + }, + { + "epoch": 2.178476020862486, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.398033142089844, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8597776889801025, + "num_tokens": 653359320.0, + "step": 17125 + }, + { + "epoch": 2.1786032311410763, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.164579391479492, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8920550346374512, + "num_tokens": 653404186.0, + "step": 17126 + }, + { + "epoch": 2.178730441419667, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.414602279663086, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8828790187835693, + "num_tokens": 653438122.0, + "step": 17127 + }, + { + "epoch": 2.1788576516982574, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.463699340820312, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8717348575592041, + "num_tokens": 653474727.0, + "step": 17128 + }, + { + "epoch": 2.178984861976848, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47168731689453, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8785015344619751, + "num_tokens": 653513531.0, + "step": 17129 + }, + { + "epoch": 2.179112072255438, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.22357749938965, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8833345174789429, + "num_tokens": 653555871.0, + "step": 17130 + }, + { + "epoch": 2.1792392825340285, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.506160736083984, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8740943670272827, + "num_tokens": 653598825.0, + "step": 17131 + }, + { + "epoch": 2.179366492812619, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.247743606567383, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.882061779499054, + "num_tokens": 653639838.0, + "step": 17132 + }, + { + "epoch": 2.1794937030912096, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.528200149536133, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8642441630363464, + "num_tokens": 653678958.0, + "step": 17133 + }, + { + "epoch": 2.1796209133698, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.434194564819336, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8806301355361938, + "num_tokens": 653718043.0, + "step": 17134 + }, + { + "epoch": 2.1797481236483907, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.356901168823242, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8869360685348511, + "num_tokens": 653756308.0, + "step": 17135 + }, + { + "epoch": 2.179875333926981, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63238525390625, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8824427127838135, + "num_tokens": 653793140.0, + "step": 17136 + }, + { + "epoch": 2.1800025442055717, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.311843872070312, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8787567615509033, + "num_tokens": 653823441.0, + "step": 17137 + }, + { + "epoch": 2.1801297544841622, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.431955337524414, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.863124430179596, + "num_tokens": 653860743.0, + "step": 17138 + }, + { + "epoch": 2.1802569647627528, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.313926696777344, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8852181434631348, + "num_tokens": 653894785.0, + "step": 17139 + }, + { + "epoch": 2.1803841750413433, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.302671432495117, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8692914247512817, + "num_tokens": 653932208.0, + "step": 17140 + }, + { + "epoch": 2.180511385319934, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30333709716797, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.859492301940918, + "num_tokens": 653968818.0, + "step": 17141 + }, + { + "epoch": 2.1806385955985244, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.362167358398438, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8696044683456421, + "num_tokens": 654010696.0, + "step": 17142 + }, + { + "epoch": 2.180765805877115, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.524850845336914, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8766671419143677, + "num_tokens": 654047632.0, + "step": 17143 + }, + { + "epoch": 2.1808930161557054, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.616485595703125, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8772193193435669, + "num_tokens": 654088200.0, + "step": 17144 + }, + { + "epoch": 2.181020226434296, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.123689651489258, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8705102205276489, + "num_tokens": 654124225.0, + "step": 17145 + }, + { + "epoch": 2.1811474367128865, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.496871948242188, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8754330277442932, + "num_tokens": 654163317.0, + "step": 17146 + }, + { + "epoch": 2.181274646991477, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24989891052246, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8651811480522156, + "num_tokens": 654199208.0, + "step": 17147 + }, + { + "epoch": 2.1814018572700675, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4205265045166, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8617806434631348, + "num_tokens": 654239988.0, + "step": 17148 + }, + { + "epoch": 2.181529067548658, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.413007736206055, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8672614097595215, + "num_tokens": 654276361.0, + "step": 17149 + }, + { + "epoch": 2.1816562778272486, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32195472717285, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8751392364501953, + "num_tokens": 654316690.0, + "step": 17150 + }, + { + "epoch": 2.181783488105839, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.297876358032227, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8819065690040588, + "num_tokens": 654354112.0, + "step": 17151 + }, + { + "epoch": 2.1819106983844296, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.513845443725586, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8842365741729736, + "num_tokens": 654398896.0, + "step": 17152 + }, + { + "epoch": 2.18203790866302, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.09912872314453, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8853195905685425, + "num_tokens": 654435180.0, + "step": 17153 + }, + { + "epoch": 2.1821651189416107, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.329191207885742, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.861652135848999, + "num_tokens": 654473958.0, + "step": 17154 + }, + { + "epoch": 2.1822923292202008, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.537755966186523, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8583641052246094, + "num_tokens": 654512918.0, + "step": 17155 + }, + { + "epoch": 2.1824195394987913, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.14482879638672, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8803195953369141, + "num_tokens": 654552296.0, + "step": 17156 + }, + { + "epoch": 2.182546749777382, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.332183837890625, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.879386305809021, + "num_tokens": 654592088.0, + "step": 17157 + }, + { + "epoch": 2.1826739600559724, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.308006286621094, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8656456470489502, + "num_tokens": 654635119.0, + "step": 17158 + }, + { + "epoch": 2.182801170334563, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.273115158081055, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8608874082565308, + "num_tokens": 654671204.0, + "step": 17159 + }, + { + "epoch": 2.1829283806131534, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.361534118652344, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8719268441200256, + "num_tokens": 654702986.0, + "step": 17160 + }, + { + "epoch": 2.183055590891744, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.346710205078125, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8499369621276855, + "num_tokens": 654745813.0, + "step": 17161 + }, + { + "epoch": 2.1831828011703345, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.284042358398438, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8730244636535645, + "num_tokens": 654783305.0, + "step": 17162 + }, + { + "epoch": 2.183310011448925, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.202573776245117, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8804265856742859, + "num_tokens": 654823427.0, + "step": 17163 + }, + { + "epoch": 2.1834372217275155, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.27912139892578, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8856745958328247, + "num_tokens": 654862400.0, + "step": 17164 + }, + { + "epoch": 2.183564432006106, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.416255950927734, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8770864605903625, + "num_tokens": 654902484.0, + "step": 17165 + }, + { + "epoch": 2.1836916422846966, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.298683166503906, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8789006471633911, + "num_tokens": 654946882.0, + "step": 17166 + }, + { + "epoch": 2.183818852563287, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5284481048584, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8638237714767456, + "num_tokens": 654977850.0, + "step": 17167 + }, + { + "epoch": 2.1839460628418776, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.364337921142578, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8593055009841919, + "num_tokens": 655015352.0, + "step": 17168 + }, + { + "epoch": 2.184073273120468, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.354877471923828, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8741855025291443, + "num_tokens": 655047488.0, + "step": 17169 + }, + { + "epoch": 2.1842004833990587, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.381546020507812, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8698779940605164, + "num_tokens": 655083410.0, + "step": 17170 + }, + { + "epoch": 2.1843276936776492, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39408302307129, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8811980485916138, + "num_tokens": 655120594.0, + "step": 17171 + }, + { + "epoch": 2.1844549039562398, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26982879638672, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8713008165359497, + "num_tokens": 655157310.0, + "step": 17172 + }, + { + "epoch": 2.1845821142348303, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39223861694336, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8821127414703369, + "num_tokens": 655193749.0, + "step": 17173 + }, + { + "epoch": 2.184709324513421, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.409866333007812, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.886854887008667, + "num_tokens": 655235532.0, + "step": 17174 + }, + { + "epoch": 2.1848365347920113, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.209897994995117, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8781079053878784, + "num_tokens": 655282154.0, + "step": 17175 + }, + { + "epoch": 2.184963745070602, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.249103546142578, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8666788339614868, + "num_tokens": 655321342.0, + "step": 17176 + }, + { + "epoch": 2.1850909553491924, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.354108810424805, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8901976346969604, + "num_tokens": 655358107.0, + "step": 17177 + }, + { + "epoch": 2.185218165627783, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.259111404418945, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8682372570037842, + "num_tokens": 655393462.0, + "step": 17178 + }, + { + "epoch": 2.1853453759063735, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.369043350219727, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.87471604347229, + "num_tokens": 655436114.0, + "step": 17179 + }, + { + "epoch": 2.1854725861849635, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29908561706543, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8753843903541565, + "num_tokens": 655473119.0, + "step": 17180 + }, + { + "epoch": 2.185599796463554, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.317760467529297, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8674987554550171, + "num_tokens": 655515537.0, + "step": 17181 + }, + { + "epoch": 2.1857270067421446, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.374191284179688, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8667784333229065, + "num_tokens": 655555944.0, + "step": 17182 + }, + { + "epoch": 2.185854217020735, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32449722290039, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8563041687011719, + "num_tokens": 655589952.0, + "step": 17183 + }, + { + "epoch": 2.1859814272993257, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.437179565429688, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8857592344284058, + "num_tokens": 655628269.0, + "step": 17184 + }, + { + "epoch": 2.186108637577916, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.234214782714844, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8761626482009888, + "num_tokens": 655657315.0, + "step": 17185 + }, + { + "epoch": 2.1862358478565067, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.358827590942383, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8730805516242981, + "num_tokens": 655696117.0, + "step": 17186 + }, + { + "epoch": 2.1863630581350972, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.284589767456055, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8558530807495117, + "num_tokens": 655738823.0, + "step": 17187 + }, + { + "epoch": 2.1864902684136878, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.294750213623047, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8633319139480591, + "num_tokens": 655781457.0, + "step": 17188 + }, + { + "epoch": 2.1866174786922783, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.567485809326172, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8683774471282959, + "num_tokens": 655824704.0, + "step": 17189 + }, + { + "epoch": 2.186744688970869, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.197690963745117, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8827041387557983, + "num_tokens": 655860013.0, + "step": 17190 + }, + { + "epoch": 2.1868718992494594, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.437286376953125, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8704968690872192, + "num_tokens": 655897146.0, + "step": 17191 + }, + { + "epoch": 2.18699910952805, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31158447265625, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8788149356842041, + "num_tokens": 655934299.0, + "step": 17192 + }, + { + "epoch": 2.1871263198066404, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49000358581543, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8763418793678284, + "num_tokens": 655976554.0, + "step": 17193 + }, + { + "epoch": 2.187253530085231, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42299461364746, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8835074305534363, + "num_tokens": 656011756.0, + "step": 17194 + }, + { + "epoch": 2.1873807403638215, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.452518463134766, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8706441521644592, + "num_tokens": 656054405.0, + "step": 17195 + }, + { + "epoch": 2.187507950642412, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31159210205078, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.876746654510498, + "num_tokens": 656094668.0, + "step": 17196 + }, + { + "epoch": 2.1876351609210025, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.356382369995117, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8732665181159973, + "num_tokens": 656137664.0, + "step": 17197 + }, + { + "epoch": 2.187762371199593, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4853458404541, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8652948141098022, + "num_tokens": 656180611.0, + "step": 17198 + }, + { + "epoch": 2.1878895814781836, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.446325302124023, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8651662468910217, + "num_tokens": 656217789.0, + "step": 17199 + }, + { + "epoch": 2.188016791756774, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.418474197387695, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8693856000900269, + "num_tokens": 656261940.0, + "step": 17200 + }, + { + "epoch": 2.1881440020353646, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39087677001953, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8826069831848145, + "num_tokens": 656296000.0, + "step": 17201 + }, + { + "epoch": 2.188271212313955, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.303850173950195, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8825762867927551, + "num_tokens": 656340862.0, + "step": 17202 + }, + { + "epoch": 2.1883984225925452, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.309818267822266, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8963901996612549, + "num_tokens": 656376923.0, + "step": 17203 + }, + { + "epoch": 2.188525632871136, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20149803161621, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8697242140769958, + "num_tokens": 656414058.0, + "step": 17204 + }, + { + "epoch": 2.1886528431497263, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.311813354492188, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8641386032104492, + "num_tokens": 656450077.0, + "step": 17205 + }, + { + "epoch": 2.188780053428317, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.40334129333496, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8655166625976562, + "num_tokens": 656487258.0, + "step": 17206 + }, + { + "epoch": 2.1889072637069074, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.293441772460938, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8869000673294067, + "num_tokens": 656525141.0, + "step": 17207 + }, + { + "epoch": 2.189034473985498, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417720794677734, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8759729862213135, + "num_tokens": 656561517.0, + "step": 17208 + }, + { + "epoch": 2.1891616842640884, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.24340057373047, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8789702653884888, + "num_tokens": 656595859.0, + "step": 17209 + }, + { + "epoch": 2.189288894542679, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.486513137817383, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8695328831672668, + "num_tokens": 656627652.0, + "step": 17210 + }, + { + "epoch": 2.1894161048212695, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23248863220215, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8794103264808655, + "num_tokens": 656669951.0, + "step": 17211 + }, + { + "epoch": 2.18954331509986, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47749137878418, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8847220540046692, + "num_tokens": 656702216.0, + "step": 17212 + }, + { + "epoch": 2.1896705253784505, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.349153518676758, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8628513216972351, + "num_tokens": 656744816.0, + "step": 17213 + }, + { + "epoch": 2.189797735657041, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.284692764282227, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8686797022819519, + "num_tokens": 656785533.0, + "step": 17214 + }, + { + "epoch": 2.1899249459356316, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.491958618164062, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.871219277381897, + "num_tokens": 656825375.0, + "step": 17215 + }, + { + "epoch": 2.190052156214222, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.46040153503418, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8807398080825806, + "num_tokens": 656857951.0, + "step": 17216 + }, + { + "epoch": 2.1901793664928126, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.478084564208984, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8820079565048218, + "num_tokens": 656896911.0, + "step": 17217 + }, + { + "epoch": 2.190306576771403, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.396989822387695, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8797955513000488, + "num_tokens": 656935723.0, + "step": 17218 + }, + { + "epoch": 2.1904337870499937, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.205698013305664, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8846625089645386, + "num_tokens": 656970881.0, + "step": 17219 + }, + { + "epoch": 2.1905609973285842, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.410400390625, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.86912602186203, + "num_tokens": 657007721.0, + "step": 17220 + }, + { + "epoch": 2.1906882076071748, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.438940048217773, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8837641477584839, + "num_tokens": 657042359.0, + "step": 17221 + }, + { + "epoch": 2.1908154178857653, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.15288734436035, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.87750244140625, + "num_tokens": 657084662.0, + "step": 17222 + }, + { + "epoch": 2.190942628164356, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48108673095703, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8752541542053223, + "num_tokens": 657122002.0, + "step": 17223 + }, + { + "epoch": 2.1910698384429463, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.302061080932617, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8746742606163025, + "num_tokens": 657158768.0, + "step": 17224 + }, + { + "epoch": 2.191197048721537, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.372695922851562, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.860758900642395, + "num_tokens": 657193445.0, + "step": 17225 + }, + { + "epoch": 2.1913242590001274, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4195613861084, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8725125789642334, + "num_tokens": 657234922.0, + "step": 17226 + }, + { + "epoch": 2.191451469278718, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.275123596191406, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.863935112953186, + "num_tokens": 657270774.0, + "step": 17227 + }, + { + "epoch": 2.191578679557308, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.535964965820312, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8877142071723938, + "num_tokens": 657307046.0, + "step": 17228 + }, + { + "epoch": 2.1917058898358985, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.320369720458984, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8637170195579529, + "num_tokens": 657352865.0, + "step": 17229 + }, + { + "epoch": 2.191833100114489, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417142868041992, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8531942367553711, + "num_tokens": 657393372.0, + "step": 17230 + }, + { + "epoch": 2.1919603103930796, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.421545028686523, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8533737063407898, + "num_tokens": 657432445.0, + "step": 17231 + }, + { + "epoch": 2.19208752067167, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.327198028564453, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8714443445205688, + "num_tokens": 657469145.0, + "step": 17232 + }, + { + "epoch": 2.1922147309502606, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23894691467285, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.887876570224762, + "num_tokens": 657510369.0, + "step": 17233 + }, + { + "epoch": 2.192341941228851, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29254150390625, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8800312876701355, + "num_tokens": 657550257.0, + "step": 17234 + }, + { + "epoch": 2.1924691515074417, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48230743408203, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.876198410987854, + "num_tokens": 657587095.0, + "step": 17235 + }, + { + "epoch": 2.1925963617860322, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.343969345092773, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8774192333221436, + "num_tokens": 657619710.0, + "step": 17236 + }, + { + "epoch": 2.1927235720646228, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2598934173584, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.868553638458252, + "num_tokens": 657653316.0, + "step": 17237 + }, + { + "epoch": 2.1928507823432133, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26996612548828, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8679062724113464, + "num_tokens": 657693570.0, + "step": 17238 + }, + { + "epoch": 2.192977992621804, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52131462097168, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8740897178649902, + "num_tokens": 657736781.0, + "step": 17239 + }, + { + "epoch": 2.1931052029003943, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48239517211914, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8644024133682251, + "num_tokens": 657775603.0, + "step": 17240 + }, + { + "epoch": 2.193232413178985, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37127113342285, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8710043430328369, + "num_tokens": 657809065.0, + "step": 17241 + }, + { + "epoch": 2.1933596234575754, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.469711303710938, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8712797164916992, + "num_tokens": 657851369.0, + "step": 17242 + }, + { + "epoch": 2.193486833736166, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.523509979248047, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.89389967918396, + "num_tokens": 657889008.0, + "step": 17243 + }, + { + "epoch": 2.1936140440147565, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33287811279297, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8714332580566406, + "num_tokens": 657925867.0, + "step": 17244 + }, + { + "epoch": 2.193741254293347, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47697639465332, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.863518238067627, + "num_tokens": 657959502.0, + "step": 17245 + }, + { + "epoch": 2.1938684645719375, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.560762405395508, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8768748044967651, + "num_tokens": 657999063.0, + "step": 17246 + }, + { + "epoch": 2.193995674850528, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30928611755371, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8634163737297058, + "num_tokens": 658033016.0, + "step": 17247 + }, + { + "epoch": 2.1941228851291186, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48137092590332, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.878960371017456, + "num_tokens": 658071259.0, + "step": 17248 + }, + { + "epoch": 2.194250095407709, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.729066848754883, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8749540448188782, + "num_tokens": 658103190.0, + "step": 17249 + }, + { + "epoch": 2.1943773056862996, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.338319778442383, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8661379814147949, + "num_tokens": 658143346.0, + "step": 17250 + }, + { + "epoch": 2.19450451596489, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.555593490600586, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8824499845504761, + "num_tokens": 658181747.0, + "step": 17251 + }, + { + "epoch": 2.1946317262434807, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.314620971679688, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8751028776168823, + "num_tokens": 658221770.0, + "step": 17252 + }, + { + "epoch": 2.1947589365220708, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.341285705566406, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8778144121170044, + "num_tokens": 658260157.0, + "step": 17253 + }, + { + "epoch": 2.1948861468006613, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.38357925415039, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8635978698730469, + "num_tokens": 658301815.0, + "step": 17254 + }, + { + "epoch": 2.195013357079252, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42140007019043, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8759989738464355, + "num_tokens": 658340732.0, + "step": 17255 + }, + { + "epoch": 2.1951405673578424, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.350173950195312, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8679453134536743, + "num_tokens": 658380569.0, + "step": 17256 + }, + { + "epoch": 2.195267777636433, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.126964569091797, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8849087953567505, + "num_tokens": 658415842.0, + "step": 17257 + }, + { + "epoch": 2.1953949879150234, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.646041870117188, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8771397471427917, + "num_tokens": 658452861.0, + "step": 17258 + }, + { + "epoch": 2.195522198193614, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29196548461914, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.876375675201416, + "num_tokens": 658487019.0, + "step": 17259 + }, + { + "epoch": 2.1956494084722045, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.293485641479492, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8714601993560791, + "num_tokens": 658529290.0, + "step": 17260 + }, + { + "epoch": 2.195776618750795, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.378841400146484, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8683413863182068, + "num_tokens": 658572118.0, + "step": 17261 + }, + { + "epoch": 2.1959038290293855, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.389923095703125, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.870111346244812, + "num_tokens": 658615972.0, + "step": 17262 + }, + { + "epoch": 2.196031039307976, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.420536041259766, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8805342316627502, + "num_tokens": 658653848.0, + "step": 17263 + }, + { + "epoch": 2.1961582495865666, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.204729080200195, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8731081485748291, + "num_tokens": 658686247.0, + "step": 17264 + }, + { + "epoch": 2.196285459865157, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.23985481262207, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8699326515197754, + "num_tokens": 658725787.0, + "step": 17265 + }, + { + "epoch": 2.1964126701437476, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.366920471191406, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8695563077926636, + "num_tokens": 658765957.0, + "step": 17266 + }, + { + "epoch": 2.196539880422338, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.3590087890625, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8833250999450684, + "num_tokens": 658800996.0, + "step": 17267 + }, + { + "epoch": 2.1966670907009287, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64317512512207, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.87061607837677, + "num_tokens": 658833779.0, + "step": 17268 + }, + { + "epoch": 2.196794300979519, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.268064498901367, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8762975335121155, + "num_tokens": 658874425.0, + "step": 17269 + }, + { + "epoch": 2.1969215112581097, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.383756637573242, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8823896646499634, + "num_tokens": 658912007.0, + "step": 17270 + }, + { + "epoch": 2.1970487215367003, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.393396377563477, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8646831512451172, + "num_tokens": 658956775.0, + "step": 17271 + }, + { + "epoch": 2.197175931815291, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.310033798217773, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8862247467041016, + "num_tokens": 658997132.0, + "step": 17272 + }, + { + "epoch": 2.1973031420938813, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.494718551635742, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8803070187568665, + "num_tokens": 659040437.0, + "step": 17273 + }, + { + "epoch": 2.197430352372472, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.531612396240234, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8886435031890869, + "num_tokens": 659074461.0, + "step": 17274 + }, + { + "epoch": 2.1975575626510624, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.454652786254883, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8703917264938354, + "num_tokens": 659113818.0, + "step": 17275 + }, + { + "epoch": 2.197684772929653, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.372495651245117, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8615573048591614, + "num_tokens": 659149297.0, + "step": 17276 + }, + { + "epoch": 2.1978119832082434, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.434383392333984, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8768525123596191, + "num_tokens": 659187500.0, + "step": 17277 + }, + { + "epoch": 2.1979391934868335, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.405027389526367, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8477919697761536, + "num_tokens": 659228624.0, + "step": 17278 + }, + { + "epoch": 2.198066403765424, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30614471435547, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8811434507369995, + "num_tokens": 659270317.0, + "step": 17279 + }, + { + "epoch": 2.1981936140440146, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.265207290649414, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8730848431587219, + "num_tokens": 659302133.0, + "step": 17280 + }, + { + "epoch": 2.198320824322605, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.541704177856445, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8791238069534302, + "num_tokens": 659342469.0, + "step": 17281 + }, + { + "epoch": 2.1984480346011956, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.17190170288086, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8848885893821716, + "num_tokens": 659373786.0, + "step": 17282 + }, + { + "epoch": 2.198575244879786, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.591175079345703, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8667734265327454, + "num_tokens": 659412498.0, + "step": 17283 + }, + { + "epoch": 2.1987024551583767, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.316184997558594, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8579630255699158, + "num_tokens": 659446415.0, + "step": 17284 + }, + { + "epoch": 2.1988296654369672, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.301259994506836, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8654946088790894, + "num_tokens": 659488824.0, + "step": 17285 + }, + { + "epoch": 2.1989568757155578, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.337543487548828, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8773860931396484, + "num_tokens": 659523036.0, + "step": 17286 + }, + { + "epoch": 2.1990840859941483, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.26184844970703, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8679630756378174, + "num_tokens": 659562780.0, + "step": 17287 + }, + { + "epoch": 2.199211296272739, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.497516632080078, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8798472285270691, + "num_tokens": 659599364.0, + "step": 17288 + }, + { + "epoch": 2.1993385065513293, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.299161911010742, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.867958664894104, + "num_tokens": 659636275.0, + "step": 17289 + }, + { + "epoch": 2.19946571682992, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34684181213379, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8730251789093018, + "num_tokens": 659670315.0, + "step": 17290 + }, + { + "epoch": 2.1995929271085104, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32320785522461, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8691326379776001, + "num_tokens": 659710968.0, + "step": 17291 + }, + { + "epoch": 2.199720137387101, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29867935180664, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8819707632064819, + "num_tokens": 659751334.0, + "step": 17292 + }, + { + "epoch": 2.1998473476656915, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.376094818115234, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8736728429794312, + "num_tokens": 659793867.0, + "step": 17293 + }, + { + "epoch": 2.199974557944282, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.399890899658203, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8735318183898926, + "num_tokens": 659838724.0, + "step": 17294 + }, + { + "epoch": 2.2001017682228725, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.218416213989258, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8707833886146545, + "num_tokens": 659876589.0, + "step": 17295 + }, + { + "epoch": 2.200228978501463, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.376928329467773, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8647393584251404, + "num_tokens": 659914864.0, + "step": 17296 + }, + { + "epoch": 2.2003561887800536, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.448532104492188, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8801060914993286, + "num_tokens": 659952482.0, + "step": 17297 + }, + { + "epoch": 2.200483399058644, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41586685180664, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8722296953201294, + "num_tokens": 659985192.0, + "step": 17298 + }, + { + "epoch": 2.2006106093372346, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.256397247314453, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8809722661972046, + "num_tokens": 660019505.0, + "step": 17299 + }, + { + "epoch": 2.200737819615825, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.53952980041504, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8669154644012451, + "num_tokens": 660054942.0, + "step": 17300 + }, + { + "epoch": 2.2008650298944152, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2661075592041, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8724316358566284, + "num_tokens": 660092836.0, + "step": 17301 + }, + { + "epoch": 2.200992240173006, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5203800201416, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8681119680404663, + "num_tokens": 660129419.0, + "step": 17302 + }, + { + "epoch": 2.2011194504515963, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.412527084350586, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8759422898292542, + "num_tokens": 660168116.0, + "step": 17303 + }, + { + "epoch": 2.201246660730187, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35820960998535, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8664459586143494, + "num_tokens": 660208006.0, + "step": 17304 + }, + { + "epoch": 2.2013738710087773, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44453239440918, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.872593879699707, + "num_tokens": 660254046.0, + "step": 17305 + }, + { + "epoch": 2.201501081287368, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49870491027832, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8730409741401672, + "num_tokens": 660289321.0, + "step": 17306 + }, + { + "epoch": 2.2016282915659584, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417869567871094, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8700035810470581, + "num_tokens": 660325052.0, + "step": 17307 + }, + { + "epoch": 2.201755501844549, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4591007232666, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8647720813751221, + "num_tokens": 660357942.0, + "step": 17308 + }, + { + "epoch": 2.2018827121231395, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.457901000976562, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8471060991287231, + "num_tokens": 660390654.0, + "step": 17309 + }, + { + "epoch": 2.20200992240173, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31150245666504, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8742460012435913, + "num_tokens": 660432020.0, + "step": 17310 + }, + { + "epoch": 2.2021371326803205, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44868278503418, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8736060857772827, + "num_tokens": 660465051.0, + "step": 17311 + }, + { + "epoch": 2.202264342958911, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32016372680664, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8689833879470825, + "num_tokens": 660504967.0, + "step": 17312 + }, + { + "epoch": 2.2023915532375016, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.069171905517578, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8634805083274841, + "num_tokens": 660543826.0, + "step": 17313 + }, + { + "epoch": 2.202518763516092, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37042999267578, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8692566156387329, + "num_tokens": 660583074.0, + "step": 17314 + }, + { + "epoch": 2.2026459737946826, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.320011138916016, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8891702890396118, + "num_tokens": 660619974.0, + "step": 17315 + }, + { + "epoch": 2.202773184073273, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.390548706054688, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8780456781387329, + "num_tokens": 660654005.0, + "step": 17316 + }, + { + "epoch": 2.2029003943518637, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.425846099853516, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8615818023681641, + "num_tokens": 660689179.0, + "step": 17317 + }, + { + "epoch": 2.203027604630454, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.421871185302734, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8612026572227478, + "num_tokens": 660724923.0, + "step": 17318 + }, + { + "epoch": 2.2031548149090447, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.565479278564453, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8765487670898438, + "num_tokens": 660760763.0, + "step": 17319 + }, + { + "epoch": 2.2032820251876353, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.263996124267578, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8742756843566895, + "num_tokens": 660795520.0, + "step": 17320 + }, + { + "epoch": 2.203409235466226, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.359785079956055, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.883278489112854, + "num_tokens": 660834858.0, + "step": 17321 + }, + { + "epoch": 2.2035364457448163, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.405799865722656, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.881212592124939, + "num_tokens": 660870319.0, + "step": 17322 + }, + { + "epoch": 2.203663656023407, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.359060287475586, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8608676791191101, + "num_tokens": 660907313.0, + "step": 17323 + }, + { + "epoch": 2.2037908663019974, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.298908233642578, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8883350491523743, + "num_tokens": 660948087.0, + "step": 17324 + }, + { + "epoch": 2.203918076580588, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64487075805664, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8787135481834412, + "num_tokens": 660984984.0, + "step": 17325 + }, + { + "epoch": 2.204045286859178, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.144561767578125, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8701711893081665, + "num_tokens": 661022898.0, + "step": 17326 + }, + { + "epoch": 2.2041724971377685, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44100570678711, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8827756643295288, + "num_tokens": 661058645.0, + "step": 17327 + }, + { + "epoch": 2.204299707416359, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.487079620361328, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8944786787033081, + "num_tokens": 661096954.0, + "step": 17328 + }, + { + "epoch": 2.2044269176949496, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.432756423950195, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8623068928718567, + "num_tokens": 661136947.0, + "step": 17329 + }, + { + "epoch": 2.20455412797354, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.220443725585938, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8741821050643921, + "num_tokens": 661175376.0, + "step": 17330 + }, + { + "epoch": 2.2046813382521306, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.452423095703125, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8665481209754944, + "num_tokens": 661213238.0, + "step": 17331 + }, + { + "epoch": 2.204808548530721, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.355058670043945, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8888794183731079, + "num_tokens": 661246512.0, + "step": 17332 + }, + { + "epoch": 2.2049357588093117, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41988182067871, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8833276033401489, + "num_tokens": 661288272.0, + "step": 17333 + }, + { + "epoch": 2.2050629690879022, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.320526123046875, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.879088819026947, + "num_tokens": 661328705.0, + "step": 17334 + }, + { + "epoch": 2.2051901793664928, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34315299987793, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8778135180473328, + "num_tokens": 661369877.0, + "step": 17335 + }, + { + "epoch": 2.2053173896450833, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.598690032958984, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8759227991104126, + "num_tokens": 661405209.0, + "step": 17336 + }, + { + "epoch": 2.205444599923674, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.525035858154297, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8773763179779053, + "num_tokens": 661441083.0, + "step": 17337 + }, + { + "epoch": 2.2055718102022643, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.433895111083984, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8759864568710327, + "num_tokens": 661476938.0, + "step": 17338 + }, + { + "epoch": 2.205699020480855, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47207260131836, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8658028841018677, + "num_tokens": 661519281.0, + "step": 17339 + }, + { + "epoch": 2.2058262307594454, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.219573974609375, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8881008625030518, + "num_tokens": 661555880.0, + "step": 17340 + }, + { + "epoch": 2.205953441038036, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.32947540283203, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8644400835037231, + "num_tokens": 661595937.0, + "step": 17341 + }, + { + "epoch": 2.2060806513166265, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52520179748535, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8600316047668457, + "num_tokens": 661636868.0, + "step": 17342 + }, + { + "epoch": 2.206207861595217, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.581695556640625, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8662823438644409, + "num_tokens": 661681503.0, + "step": 17343 + }, + { + "epoch": 2.2063350718738075, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.561681747436523, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8769588470458984, + "num_tokens": 661718438.0, + "step": 17344 + }, + { + "epoch": 2.206462282152398, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.3330135345459, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8536959886550903, + "num_tokens": 661759048.0, + "step": 17345 + }, + { + "epoch": 2.2065894924309886, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.367595672607422, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8836002945899963, + "num_tokens": 661803131.0, + "step": 17346 + }, + { + "epoch": 2.206716702709579, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.522192001342773, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8743797540664673, + "num_tokens": 661838139.0, + "step": 17347 + }, + { + "epoch": 2.2068439129881696, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.287376403808594, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8663957118988037, + "num_tokens": 661878568.0, + "step": 17348 + }, + { + "epoch": 2.20697112326676, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.46433448791504, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8666160106658936, + "num_tokens": 661909110.0, + "step": 17349 + }, + { + "epoch": 2.2070983335453507, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.320770263671875, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8865869045257568, + "num_tokens": 661944472.0, + "step": 17350 + }, + { + "epoch": 2.2072255438239408, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.188888549804688, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8853328824043274, + "num_tokens": 661985865.0, + "step": 17351 + }, + { + "epoch": 2.2073527541025313, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.577585220336914, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8688708543777466, + "num_tokens": 662019460.0, + "step": 17352 + }, + { + "epoch": 2.207479964381122, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4731388092041, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8809050917625427, + "num_tokens": 662059109.0, + "step": 17353 + }, + { + "epoch": 2.2076071746597123, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.710983276367188, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8669248223304749, + "num_tokens": 662096040.0, + "step": 17354 + }, + { + "epoch": 2.207734384938303, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.442169189453125, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8783853054046631, + "num_tokens": 662139351.0, + "step": 17355 + }, + { + "epoch": 2.2078615952168934, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.450923919677734, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8740885257720947, + "num_tokens": 662177353.0, + "step": 17356 + }, + { + "epoch": 2.207988805495484, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.520463943481445, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8634215593338013, + "num_tokens": 662209519.0, + "step": 17357 + }, + { + "epoch": 2.2081160157740745, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.285600662231445, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8541970252990723, + "num_tokens": 662251311.0, + "step": 17358 + }, + { + "epoch": 2.208243226052665, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.463228225708008, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8800750374794006, + "num_tokens": 662282770.0, + "step": 17359 + }, + { + "epoch": 2.2083704363312555, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.479692459106445, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8695566654205322, + "num_tokens": 662326770.0, + "step": 17360 + }, + { + "epoch": 2.208497646609846, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47524642944336, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8726508617401123, + "num_tokens": 662366811.0, + "step": 17361 + }, + { + "epoch": 2.2086248568884366, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.429866790771484, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8688933849334717, + "num_tokens": 662404517.0, + "step": 17362 + }, + { + "epoch": 2.208752067167027, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.400049209594727, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.879340648651123, + "num_tokens": 662442091.0, + "step": 17363 + }, + { + "epoch": 2.2088792774456176, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.332544326782227, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8484057188034058, + "num_tokens": 662483343.0, + "step": 17364 + }, + { + "epoch": 2.209006487724208, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47186851501465, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8667914271354675, + "num_tokens": 662521994.0, + "step": 17365 + }, + { + "epoch": 2.2091336980027987, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.287424087524414, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8689556121826172, + "num_tokens": 662562737.0, + "step": 17366 + }, + { + "epoch": 2.209260908281389, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.295766830444336, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8701720237731934, + "num_tokens": 662596927.0, + "step": 17367 + }, + { + "epoch": 2.2093881185599797, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29582977294922, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.881122350692749, + "num_tokens": 662632831.0, + "step": 17368 + }, + { + "epoch": 2.2095153288385703, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.432201385498047, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8701484203338623, + "num_tokens": 662675554.0, + "step": 17369 + }, + { + "epoch": 2.209642539117161, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.204389572143555, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.88398677110672, + "num_tokens": 662720248.0, + "step": 17370 + }, + { + "epoch": 2.2097697493957513, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.209596633911133, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8736891746520996, + "num_tokens": 662761388.0, + "step": 17371 + }, + { + "epoch": 2.209896959674342, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.538036346435547, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8726363182067871, + "num_tokens": 662798007.0, + "step": 17372 + }, + { + "epoch": 2.2100241699529324, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.46316146850586, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.870256781578064, + "num_tokens": 662836042.0, + "step": 17373 + }, + { + "epoch": 2.210151380231523, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.25843048095703, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8634666204452515, + "num_tokens": 662869465.0, + "step": 17374 + }, + { + "epoch": 2.2102785905101134, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4422664642334, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8652979135513306, + "num_tokens": 662909440.0, + "step": 17375 + }, + { + "epoch": 2.2104058007887035, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36553955078125, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.868972897529602, + "num_tokens": 662949637.0, + "step": 17376 + }, + { + "epoch": 2.210533011067294, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.46933364868164, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8710224032402039, + "num_tokens": 662987679.0, + "step": 17377 + }, + { + "epoch": 2.2106602213458846, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.364810943603516, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8541721701622009, + "num_tokens": 663027651.0, + "step": 17378 + }, + { + "epoch": 2.210787431624475, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.51313018798828, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8852549195289612, + "num_tokens": 663063708.0, + "step": 17379 + }, + { + "epoch": 2.2109146419030656, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.334108352661133, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8654919862747192, + "num_tokens": 663097255.0, + "step": 17380 + }, + { + "epoch": 2.211041852181656, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.318012237548828, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8698980808258057, + "num_tokens": 663134496.0, + "step": 17381 + }, + { + "epoch": 2.2111690624602467, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.315683364868164, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8819403648376465, + "num_tokens": 663170278.0, + "step": 17382 + }, + { + "epoch": 2.211296272738837, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.374385833740234, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8701387047767639, + "num_tokens": 663205407.0, + "step": 17383 + }, + { + "epoch": 2.2114234830174277, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.407588958740234, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8773692846298218, + "num_tokens": 663243166.0, + "step": 17384 + }, + { + "epoch": 2.2115506932960183, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33705711364746, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8949975967407227, + "num_tokens": 663282267.0, + "step": 17385 + }, + { + "epoch": 2.211677903574609, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.598756790161133, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8643472194671631, + "num_tokens": 663318620.0, + "step": 17386 + }, + { + "epoch": 2.2118051138531993, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.360654830932617, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8903839588165283, + "num_tokens": 663353256.0, + "step": 17387 + }, + { + "epoch": 2.21193232413179, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.161788940429688, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.875859797000885, + "num_tokens": 663386691.0, + "step": 17388 + }, + { + "epoch": 2.2120595344103804, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73381996154785, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8693150281906128, + "num_tokens": 663416550.0, + "step": 17389 + }, + { + "epoch": 2.212186744688971, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.20103645324707, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8748316764831543, + "num_tokens": 663464912.0, + "step": 17390 + }, + { + "epoch": 2.2123139549675614, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.393829345703125, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8639573454856873, + "num_tokens": 663503786.0, + "step": 17391 + }, + { + "epoch": 2.212441165246152, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.540769577026367, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.880698561668396, + "num_tokens": 663540579.0, + "step": 17392 + }, + { + "epoch": 2.2125683755247425, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.297359466552734, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8746925592422485, + "num_tokens": 663577284.0, + "step": 17393 + }, + { + "epoch": 2.212695585803333, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39942741394043, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8689827919006348, + "num_tokens": 663613118.0, + "step": 17394 + }, + { + "epoch": 2.2128227960819236, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.509748458862305, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.877473771572113, + "num_tokens": 663651548.0, + "step": 17395 + }, + { + "epoch": 2.212950006360514, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.223764419555664, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8548506498336792, + "num_tokens": 663694381.0, + "step": 17396 + }, + { + "epoch": 2.2130772166391046, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.495830535888672, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.874055027961731, + "num_tokens": 663734686.0, + "step": 17397 + }, + { + "epoch": 2.213204426917695, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.462312698364258, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8636181354522705, + "num_tokens": 663768764.0, + "step": 17398 + }, + { + "epoch": 2.2133316371962852, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.27702522277832, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8928025960922241, + "num_tokens": 663807650.0, + "step": 17399 + }, + { + "epoch": 2.213458847474876, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.367918014526367, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8704551458358765, + "num_tokens": 663849308.0, + "step": 17400 + }, + { + "epoch": 2.2135860577534663, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.394075393676758, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8585152626037598, + "num_tokens": 663888917.0, + "step": 17401 + }, + { + "epoch": 2.213713268032057, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.569570541381836, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8737264275550842, + "num_tokens": 663921048.0, + "step": 17402 + }, + { + "epoch": 2.2138404783106473, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.312597274780273, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8781967163085938, + "num_tokens": 663963433.0, + "step": 17403 + }, + { + "epoch": 2.213967688589238, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.776784896850586, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8565149307250977, + "num_tokens": 664003294.0, + "step": 17404 + }, + { + "epoch": 2.2140948988678284, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.438541412353516, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8613637685775757, + "num_tokens": 664043161.0, + "step": 17405 + }, + { + "epoch": 2.214222109146419, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.386741638183594, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8844465613365173, + "num_tokens": 664082457.0, + "step": 17406 + }, + { + "epoch": 2.2143493194250095, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.598413467407227, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8824745416641235, + "num_tokens": 664125999.0, + "step": 17407 + }, + { + "epoch": 2.2144765297036, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.477645874023438, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8786633014678955, + "num_tokens": 664167256.0, + "step": 17408 + }, + { + "epoch": 2.2146037399821905, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.484575271606445, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8686708807945251, + "num_tokens": 664201887.0, + "step": 17409 + }, + { + "epoch": 2.214730950260781, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.736268997192383, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8776408433914185, + "num_tokens": 664238986.0, + "step": 17410 + }, + { + "epoch": 2.2148581605393716, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.598703384399414, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.869388997554779, + "num_tokens": 664283436.0, + "step": 17411 + }, + { + "epoch": 2.214985370817962, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.506122589111328, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8665115833282471, + "num_tokens": 664325196.0, + "step": 17412 + }, + { + "epoch": 2.2151125810965526, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.391817092895508, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.864355206489563, + "num_tokens": 664365722.0, + "step": 17413 + }, + { + "epoch": 2.215239791375143, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72854995727539, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8635144233703613, + "num_tokens": 664411570.0, + "step": 17414 + }, + { + "epoch": 2.2153670016537337, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.30593490600586, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8755511045455933, + "num_tokens": 664453545.0, + "step": 17415 + }, + { + "epoch": 2.215494211932324, + "ewc_loss": 0.036376953125, + "ewc_loss_parallel": 3.647804260253906e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.442533493041992, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8731573820114136, + "num_tokens": 664489882.0, + "step": 17416 + }, + { + "epoch": 2.2156214222109147, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.577499389648438, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8707119226455688, + "num_tokens": 664528720.0, + "step": 17417 + }, + { + "epoch": 2.2157486324895053, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.674415588378906, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8632286787033081, + "num_tokens": 664569645.0, + "step": 17418 + }, + { + "epoch": 2.215875842768096, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.254932403564453, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8672928810119629, + "num_tokens": 664612033.0, + "step": 17419 + }, + { + "epoch": 2.2160030530466863, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.530838012695312, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8803359270095825, + "num_tokens": 664651408.0, + "step": 17420 + }, + { + "epoch": 2.216130263325277, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.53752899169922, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8571805953979492, + "num_tokens": 664695764.0, + "step": 17421 + }, + { + "epoch": 2.2162574736038674, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.415664672851562, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.875817060470581, + "num_tokens": 664735528.0, + "step": 17422 + }, + { + "epoch": 2.216384683882458, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.335838317871094, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.865301787853241, + "num_tokens": 664775612.0, + "step": 17423 + }, + { + "epoch": 2.216511894161048, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.341188430786133, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8685490489006042, + "num_tokens": 664812485.0, + "step": 17424 + }, + { + "epoch": 2.2166391044396385, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.399734497070312, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8566477298736572, + "num_tokens": 664853166.0, + "step": 17425 + }, + { + "epoch": 2.216766314718229, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.311553955078125, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8614662289619446, + "num_tokens": 664888778.0, + "step": 17426 + }, + { + "epoch": 2.2168935249968196, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.457609176635742, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8706657886505127, + "num_tokens": 664933783.0, + "step": 17427 + }, + { + "epoch": 2.21702073527541, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.685441970825195, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8570376038551331, + "num_tokens": 664965228.0, + "step": 17428 + }, + { + "epoch": 2.2171479455540006, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.590307235717773, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8687839508056641, + "num_tokens": 664996691.0, + "step": 17429 + }, + { + "epoch": 2.217275155832591, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.524839401245117, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8830661177635193, + "num_tokens": 665032006.0, + "step": 17430 + }, + { + "epoch": 2.2174023661111817, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.374557495117188, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8621507883071899, + "num_tokens": 665071819.0, + "step": 17431 + }, + { + "epoch": 2.217529576389772, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.677715301513672, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8607156276702881, + "num_tokens": 665118883.0, + "step": 17432 + }, + { + "epoch": 2.2176567866683627, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.434579849243164, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8543606996536255, + "num_tokens": 665154926.0, + "step": 17433 + }, + { + "epoch": 2.2177839969469533, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.364219665527344, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8804128170013428, + "num_tokens": 665189910.0, + "step": 17434 + }, + { + "epoch": 2.217911207225544, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.491619110107422, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8792587518692017, + "num_tokens": 665235849.0, + "step": 17435 + }, + { + "epoch": 2.2180384175041343, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8217830657959, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8624979853630066, + "num_tokens": 665274231.0, + "step": 17436 + }, + { + "epoch": 2.218165627782725, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.432886123657227, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8892546892166138, + "num_tokens": 665309057.0, + "step": 17437 + }, + { + "epoch": 2.2182928380613154, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.38098907470703, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8699815273284912, + "num_tokens": 665337031.0, + "step": 17438 + }, + { + "epoch": 2.218420048339906, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33403778076172, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8714257478713989, + "num_tokens": 665376557.0, + "step": 17439 + }, + { + "epoch": 2.2185472586184964, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.555423736572266, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8704128265380859, + "num_tokens": 665419928.0, + "step": 17440 + }, + { + "epoch": 2.218674468897087, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.399913787841797, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8778135180473328, + "num_tokens": 665458827.0, + "step": 17441 + }, + { + "epoch": 2.2188016791756775, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34634780883789, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8743739128112793, + "num_tokens": 665498623.0, + "step": 17442 + }, + { + "epoch": 2.218928889454268, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.456531524658203, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8665110468864441, + "num_tokens": 665536103.0, + "step": 17443 + }, + { + "epoch": 2.2190560997328586, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37116050720215, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8694140315055847, + "num_tokens": 665572089.0, + "step": 17444 + }, + { + "epoch": 2.219183310011449, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.525808334350586, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8636962175369263, + "num_tokens": 665608088.0, + "step": 17445 + }, + { + "epoch": 2.2193105202900396, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.522964477539062, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.880865216255188, + "num_tokens": 665641883.0, + "step": 17446 + }, + { + "epoch": 2.21943773056863, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.426523208618164, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8769319653511047, + "num_tokens": 665682211.0, + "step": 17447 + }, + { + "epoch": 2.2195649408472207, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.380104064941406, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8773187398910522, + "num_tokens": 665726182.0, + "step": 17448 + }, + { + "epoch": 2.2196921511258108, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.333093643188477, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.875957190990448, + "num_tokens": 665760404.0, + "step": 17449 + }, + { + "epoch": 2.2198193614044013, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.524747848510742, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8719608187675476, + "num_tokens": 665798235.0, + "step": 17450 + }, + { + "epoch": 2.219946571682992, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49221420288086, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8722460865974426, + "num_tokens": 665840118.0, + "step": 17451 + }, + { + "epoch": 2.2200737819615823, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4188175201416, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.871018648147583, + "num_tokens": 665878498.0, + "step": 17452 + }, + { + "epoch": 2.220200992240173, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.744075775146484, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8674139380455017, + "num_tokens": 665924179.0, + "step": 17453 + }, + { + "epoch": 2.2203282025187634, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.455469131469727, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8849983215332031, + "num_tokens": 665963160.0, + "step": 17454 + }, + { + "epoch": 2.220455412797354, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.362829208374023, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8811050057411194, + "num_tokens": 665998854.0, + "step": 17455 + }, + { + "epoch": 2.2205826230759445, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.341318130493164, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8796955943107605, + "num_tokens": 666036420.0, + "step": 17456 + }, + { + "epoch": 2.220709833354535, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.330583572387695, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8527106046676636, + "num_tokens": 666076692.0, + "step": 17457 + }, + { + "epoch": 2.2208370436331255, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.40568733215332, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8581987619400024, + "num_tokens": 666118009.0, + "step": 17458 + }, + { + "epoch": 2.220964253911716, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.238548278808594, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8622806072235107, + "num_tokens": 666158132.0, + "step": 17459 + }, + { + "epoch": 2.2210914641903066, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58930206298828, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8675177097320557, + "num_tokens": 666187195.0, + "step": 17460 + }, + { + "epoch": 2.221218674468897, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.513338088989258, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8770796060562134, + "num_tokens": 666227790.0, + "step": 17461 + }, + { + "epoch": 2.2213458847474876, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.427898406982422, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8652489185333252, + "num_tokens": 666268517.0, + "step": 17462 + }, + { + "epoch": 2.221473095026078, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.383161544799805, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8897848129272461, + "num_tokens": 666313727.0, + "step": 17463 + }, + { + "epoch": 2.2216003053046687, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35161018371582, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8747553825378418, + "num_tokens": 666352713.0, + "step": 17464 + }, + { + "epoch": 2.221727515583259, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.561674118041992, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8793402910232544, + "num_tokens": 666395065.0, + "step": 17465 + }, + { + "epoch": 2.2218547258618497, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.492572784423828, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8733451962471008, + "num_tokens": 666428019.0, + "step": 17466 + }, + { + "epoch": 2.2219819361404403, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.431644439697266, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8701788783073425, + "num_tokens": 666466830.0, + "step": 17467 + }, + { + "epoch": 2.222109146419031, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.357553482055664, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8899096250534058, + "num_tokens": 666501028.0, + "step": 17468 + }, + { + "epoch": 2.2222363566976213, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.475433349609375, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8848413825035095, + "num_tokens": 666538554.0, + "step": 17469 + }, + { + "epoch": 2.222363566976212, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.28005027770996, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8715042471885681, + "num_tokens": 666577019.0, + "step": 17470 + }, + { + "epoch": 2.2224907772548024, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.498796463012695, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8872631788253784, + "num_tokens": 666616597.0, + "step": 17471 + }, + { + "epoch": 2.222617987533393, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.415449142456055, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.875857412815094, + "num_tokens": 666651523.0, + "step": 17472 + }, + { + "epoch": 2.2227451978119834, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.486923217773438, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8628556728363037, + "num_tokens": 666689270.0, + "step": 17473 + }, + { + "epoch": 2.2228724080905735, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.511205673217773, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8684343695640564, + "num_tokens": 666726836.0, + "step": 17474 + }, + { + "epoch": 2.222999618369164, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.464393615722656, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8697643280029297, + "num_tokens": 666761207.0, + "step": 17475 + }, + { + "epoch": 2.2231268286477546, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.614505767822266, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8803651332855225, + "num_tokens": 666800293.0, + "step": 17476 + }, + { + "epoch": 2.223254038926345, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43254280090332, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8807597160339355, + "num_tokens": 666843046.0, + "step": 17477 + }, + { + "epoch": 2.2233812492049356, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.453685760498047, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8850923776626587, + "num_tokens": 666885840.0, + "step": 17478 + }, + { + "epoch": 2.223508459483526, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.391611099243164, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8727326393127441, + "num_tokens": 666925654.0, + "step": 17479 + }, + { + "epoch": 2.2236356697621167, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.468225479125977, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8740003108978271, + "num_tokens": 666968098.0, + "step": 17480 + }, + { + "epoch": 2.223762880040707, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4177303314209, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8832608461380005, + "num_tokens": 667010349.0, + "step": 17481 + }, + { + "epoch": 2.2238900903192977, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.649324417114258, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.863498866558075, + "num_tokens": 667047086.0, + "step": 17482 + }, + { + "epoch": 2.2240173005978883, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.530757904052734, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8560172915458679, + "num_tokens": 667083955.0, + "step": 17483 + }, + { + "epoch": 2.224144510876479, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.377901077270508, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8794071078300476, + "num_tokens": 667119751.0, + "step": 17484 + }, + { + "epoch": 2.2242717211550693, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.565837860107422, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8683762550354004, + "num_tokens": 667150980.0, + "step": 17485 + }, + { + "epoch": 2.22439893143366, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.495994567871094, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8704213500022888, + "num_tokens": 667185198.0, + "step": 17486 + }, + { + "epoch": 2.2245261417122504, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42055320739746, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8754444122314453, + "num_tokens": 667225286.0, + "step": 17487 + }, + { + "epoch": 2.224653351990841, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.404388427734375, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8669458627700806, + "num_tokens": 667266294.0, + "step": 17488 + }, + { + "epoch": 2.2247805622694314, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.619346618652344, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8700023889541626, + "num_tokens": 667302430.0, + "step": 17489 + }, + { + "epoch": 2.224907772548022, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52341651916504, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8670783638954163, + "num_tokens": 667342881.0, + "step": 17490 + }, + { + "epoch": 2.2250349828266125, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.529390335083008, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8751846551895142, + "num_tokens": 667376122.0, + "step": 17491 + }, + { + "epoch": 2.225162193105203, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39515495300293, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8657206296920776, + "num_tokens": 667415120.0, + "step": 17492 + }, + { + "epoch": 2.2252894033837936, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.217334747314453, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8720481395721436, + "num_tokens": 667451014.0, + "step": 17493 + }, + { + "epoch": 2.225416613662384, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.602460861206055, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8637998104095459, + "num_tokens": 667485600.0, + "step": 17494 + }, + { + "epoch": 2.2255438239409746, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.188417434692383, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8678102493286133, + "num_tokens": 667527561.0, + "step": 17495 + }, + { + "epoch": 2.225671034219565, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.341394424438477, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8796893358230591, + "num_tokens": 667569677.0, + "step": 17496 + }, + { + "epoch": 2.225798244498155, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.271516799926758, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8722635507583618, + "num_tokens": 667605954.0, + "step": 17497 + }, + { + "epoch": 2.225925454776746, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.335411071777344, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8756892085075378, + "num_tokens": 667645082.0, + "step": 17498 + }, + { + "epoch": 2.2260526650553363, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.311683654785156, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8690497875213623, + "num_tokens": 667681390.0, + "step": 17499 + }, + { + "epoch": 2.226179875333927, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.19817352294922, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8633935451507568, + "num_tokens": 667722772.0, + "step": 17500 + }, + { + "epoch": 2.2263070856125173, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.606746673583984, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8473495244979858, + "num_tokens": 667758647.0, + "step": 17501 + }, + { + "epoch": 2.226434295891108, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36582374572754, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8729401230812073, + "num_tokens": 667790625.0, + "step": 17502 + }, + { + "epoch": 2.2265615061696984, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.316112518310547, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8619628548622131, + "num_tokens": 667823724.0, + "step": 17503 + }, + { + "epoch": 2.226688716448289, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.544668197631836, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8861844539642334, + "num_tokens": 667862548.0, + "step": 17504 + }, + { + "epoch": 2.2268159267268794, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.359189987182617, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8654555082321167, + "num_tokens": 667905994.0, + "step": 17505 + }, + { + "epoch": 2.22694313700547, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.360210418701172, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8653786182403564, + "num_tokens": 667945087.0, + "step": 17506 + }, + { + "epoch": 2.2270703472840605, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.510000228881836, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8828567862510681, + "num_tokens": 667984736.0, + "step": 17507 + }, + { + "epoch": 2.227197557562651, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.277996063232422, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8673318028450012, + "num_tokens": 668024020.0, + "step": 17508 + }, + { + "epoch": 2.2273247678412416, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50660514831543, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8670212626457214, + "num_tokens": 668059567.0, + "step": 17509 + }, + { + "epoch": 2.227451978119832, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35427474975586, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8776524066925049, + "num_tokens": 668099901.0, + "step": 17510 + }, + { + "epoch": 2.2275791883984226, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48604965209961, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8863785266876221, + "num_tokens": 668135974.0, + "step": 17511 + }, + { + "epoch": 2.227706398677013, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.380212783813477, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8741312026977539, + "num_tokens": 668179200.0, + "step": 17512 + }, + { + "epoch": 2.2278336089556037, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.503332138061523, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8506826162338257, + "num_tokens": 668219683.0, + "step": 17513 + }, + { + "epoch": 2.227960819234194, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.25984764099121, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8744894862174988, + "num_tokens": 668253899.0, + "step": 17514 + }, + { + "epoch": 2.2280880295127847, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43126106262207, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8654808402061462, + "num_tokens": 668294140.0, + "step": 17515 + }, + { + "epoch": 2.2282152397913753, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.464183807373047, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8781822323799133, + "num_tokens": 668328442.0, + "step": 17516 + }, + { + "epoch": 2.228342450069966, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.188344955444336, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8760237693786621, + "num_tokens": 668367903.0, + "step": 17517 + }, + { + "epoch": 2.2284696603485563, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.492481231689453, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.883929967880249, + "num_tokens": 668409889.0, + "step": 17518 + }, + { + "epoch": 2.228596870627147, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.434955596923828, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8762797117233276, + "num_tokens": 668451061.0, + "step": 17519 + }, + { + "epoch": 2.2287240809057374, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.162921905517578, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8505061268806458, + "num_tokens": 668493038.0, + "step": 17520 + }, + { + "epoch": 2.228851291184328, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.557092666625977, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.878324031829834, + "num_tokens": 668528087.0, + "step": 17521 + }, + { + "epoch": 2.228978501462918, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.369735717773438, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8735893964767456, + "num_tokens": 668562325.0, + "step": 17522 + }, + { + "epoch": 2.2291057117415085, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39974021911621, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8754342794418335, + "num_tokens": 668601336.0, + "step": 17523 + }, + { + "epoch": 2.229232922020099, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.496334075927734, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8906744718551636, + "num_tokens": 668637113.0, + "step": 17524 + }, + { + "epoch": 2.2293601322986896, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.567319869995117, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8687314987182617, + "num_tokens": 668675645.0, + "step": 17525 + }, + { + "epoch": 2.22948734257728, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.319162368774414, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.872927188873291, + "num_tokens": 668706480.0, + "step": 17526 + }, + { + "epoch": 2.2296145528558706, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.500137329101562, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8661156892776489, + "num_tokens": 668743956.0, + "step": 17527 + }, + { + "epoch": 2.229741763134461, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.116117477416992, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8814300298690796, + "num_tokens": 668779134.0, + "step": 17528 + }, + { + "epoch": 2.2298689734130517, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.484289169311523, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8741231560707092, + "num_tokens": 668814978.0, + "step": 17529 + }, + { + "epoch": 2.229996183691642, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.420324325561523, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8564333319664001, + "num_tokens": 668854194.0, + "step": 17530 + }, + { + "epoch": 2.2301233939702327, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.474512100219727, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.865038275718689, + "num_tokens": 668893095.0, + "step": 17531 + }, + { + "epoch": 2.2302506042488233, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.316726684570312, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8799154758453369, + "num_tokens": 668930626.0, + "step": 17532 + }, + { + "epoch": 2.230377814527414, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.695100784301758, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8797745704650879, + "num_tokens": 668960941.0, + "step": 17533 + }, + { + "epoch": 2.2305050248060043, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.418643951416016, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.874840497970581, + "num_tokens": 668997771.0, + "step": 17534 + }, + { + "epoch": 2.230632235084595, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.611019134521484, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8863545656204224, + "num_tokens": 669038714.0, + "step": 17535 + }, + { + "epoch": 2.2307594453631854, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31658363342285, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8811206817626953, + "num_tokens": 669072336.0, + "step": 17536 + }, + { + "epoch": 2.230886655641776, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50995445251465, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8822364211082458, + "num_tokens": 669110303.0, + "step": 17537 + }, + { + "epoch": 2.2310138659203664, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58437728881836, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8628900051116943, + "num_tokens": 669146874.0, + "step": 17538 + }, + { + "epoch": 2.231141076198957, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.351917266845703, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.884128987789154, + "num_tokens": 669181682.0, + "step": 17539 + }, + { + "epoch": 2.2312682864775475, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36811637878418, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8721507787704468, + "num_tokens": 669227201.0, + "step": 17540 + }, + { + "epoch": 2.231395496756138, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.551292419433594, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8728665709495544, + "num_tokens": 669265915.0, + "step": 17541 + }, + { + "epoch": 2.2315227070347285, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49496841430664, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8770710229873657, + "num_tokens": 669304130.0, + "step": 17542 + }, + { + "epoch": 2.231649917313319, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.492830276489258, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8710170984268188, + "num_tokens": 669344225.0, + "step": 17543 + }, + { + "epoch": 2.2317771275919096, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.600740432739258, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8544152975082397, + "num_tokens": 669385126.0, + "step": 17544 + }, + { + "epoch": 2.2319043378705, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.383317947387695, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8610987663269043, + "num_tokens": 669425774.0, + "step": 17545 + }, + { + "epoch": 2.2320315481490907, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.536462783813477, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8634340763092041, + "num_tokens": 669459789.0, + "step": 17546 + }, + { + "epoch": 2.2321587584276807, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.368938446044922, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.888070821762085, + "num_tokens": 669498709.0, + "step": 17547 + }, + { + "epoch": 2.2322859687062713, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36573028564453, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8777954578399658, + "num_tokens": 669536021.0, + "step": 17548 + }, + { + "epoch": 2.232413178984862, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.514238357543945, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8612245321273804, + "num_tokens": 669574316.0, + "step": 17549 + }, + { + "epoch": 2.2325403892634523, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.292722702026367, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8632388114929199, + "num_tokens": 669611826.0, + "step": 17550 + }, + { + "epoch": 2.232667599542043, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.493267059326172, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8695029020309448, + "num_tokens": 669655277.0, + "step": 17551 + }, + { + "epoch": 2.2327948098206334, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47132682800293, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8669488430023193, + "num_tokens": 669693481.0, + "step": 17552 + }, + { + "epoch": 2.232922020099224, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417713165283203, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8718639612197876, + "num_tokens": 669724104.0, + "step": 17553 + }, + { + "epoch": 2.2330492303778144, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.331846237182617, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8552906513214111, + "num_tokens": 669768725.0, + "step": 17554 + }, + { + "epoch": 2.233176440656405, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.398874282836914, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8809161186218262, + "num_tokens": 669809508.0, + "step": 17555 + }, + { + "epoch": 2.2333036509349955, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.535064697265625, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8733538389205933, + "num_tokens": 669848047.0, + "step": 17556 + }, + { + "epoch": 2.233430861213586, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.329219818115234, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8883579969406128, + "num_tokens": 669881274.0, + "step": 17557 + }, + { + "epoch": 2.2335580714921766, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.605405807495117, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8660622835159302, + "num_tokens": 669918517.0, + "step": 17558 + }, + { + "epoch": 2.233685281770767, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41315269470215, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8648983240127563, + "num_tokens": 669960990.0, + "step": 17559 + }, + { + "epoch": 2.2338124920493576, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.369873046875, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8778030872344971, + "num_tokens": 669995542.0, + "step": 17560 + }, + { + "epoch": 2.233939702327948, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.319976806640625, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8729267716407776, + "num_tokens": 670036798.0, + "step": 17561 + }, + { + "epoch": 2.2340669126065387, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63570213317871, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8703226447105408, + "num_tokens": 670071763.0, + "step": 17562 + }, + { + "epoch": 2.234194122885129, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.494951248168945, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8711784482002258, + "num_tokens": 670100864.0, + "step": 17563 + }, + { + "epoch": 2.2343213331637197, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4205379486084, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8607142567634583, + "num_tokens": 670144114.0, + "step": 17564 + }, + { + "epoch": 2.2344485434423103, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.3802547454834, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.876326322555542, + "num_tokens": 670182719.0, + "step": 17565 + }, + { + "epoch": 2.234575753720901, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44350242614746, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8766149282455444, + "num_tokens": 670218059.0, + "step": 17566 + }, + { + "epoch": 2.2347029639994913, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.395893096923828, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8683013319969177, + "num_tokens": 670259589.0, + "step": 17567 + }, + { + "epoch": 2.234830174278082, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.38823699951172, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.863662838935852, + "num_tokens": 670301755.0, + "step": 17568 + }, + { + "epoch": 2.2349573845566724, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.498064041137695, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8762998580932617, + "num_tokens": 670339205.0, + "step": 17569 + }, + { + "epoch": 2.235084594835263, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.212873458862305, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8752999305725098, + "num_tokens": 670382961.0, + "step": 17570 + }, + { + "epoch": 2.2352118051138534, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.442058563232422, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8622391819953918, + "num_tokens": 670420747.0, + "step": 17571 + }, + { + "epoch": 2.2353390153924435, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52741241455078, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8894083499908447, + "num_tokens": 670457906.0, + "step": 17572 + }, + { + "epoch": 2.235466225671034, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.436120986938477, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8605150580406189, + "num_tokens": 670493187.0, + "step": 17573 + }, + { + "epoch": 2.2355934359496246, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35953140258789, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8859797716140747, + "num_tokens": 670529750.0, + "step": 17574 + }, + { + "epoch": 2.235720646228215, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33542823791504, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8704453110694885, + "num_tokens": 670570147.0, + "step": 17575 + }, + { + "epoch": 2.2358478565068056, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.340654373168945, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8739721775054932, + "num_tokens": 670604113.0, + "step": 17576 + }, + { + "epoch": 2.235975066785396, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.419912338256836, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8714027404785156, + "num_tokens": 670645451.0, + "step": 17577 + }, + { + "epoch": 2.2361022770639867, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.474592208862305, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8771111369132996, + "num_tokens": 670684055.0, + "step": 17578 + }, + { + "epoch": 2.236229487342577, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.25345802307129, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8644649982452393, + "num_tokens": 670724397.0, + "step": 17579 + }, + { + "epoch": 2.2363566976211677, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45669937133789, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8824237585067749, + "num_tokens": 670761282.0, + "step": 17580 + }, + { + "epoch": 2.2364839078997583, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.287641525268555, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8769041299819946, + "num_tokens": 670800007.0, + "step": 17581 + }, + { + "epoch": 2.236611118178349, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.237577438354492, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8683243989944458, + "num_tokens": 670840040.0, + "step": 17582 + }, + { + "epoch": 2.2367383284569393, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.144344329833984, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8651216626167297, + "num_tokens": 670881193.0, + "step": 17583 + }, + { + "epoch": 2.23686553873553, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.448925018310547, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8751948475837708, + "num_tokens": 670927275.0, + "step": 17584 + }, + { + "epoch": 2.2369927490141204, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.370328903198242, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8763552904129028, + "num_tokens": 670967327.0, + "step": 17585 + }, + { + "epoch": 2.237119959292711, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.371631622314453, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.876084566116333, + "num_tokens": 671009250.0, + "step": 17586 + }, + { + "epoch": 2.2372471695713014, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.395530700683594, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8761143088340759, + "num_tokens": 671045068.0, + "step": 17587 + }, + { + "epoch": 2.237374379849892, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.295766830444336, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8594498038291931, + "num_tokens": 671084786.0, + "step": 17588 + }, + { + "epoch": 2.2375015901284825, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.427959442138672, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8657126426696777, + "num_tokens": 671123235.0, + "step": 17589 + }, + { + "epoch": 2.237628800407073, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.292316436767578, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8884439468383789, + "num_tokens": 671161702.0, + "step": 17590 + }, + { + "epoch": 2.2377560106856635, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.429790496826172, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8835562467575073, + "num_tokens": 671195487.0, + "step": 17591 + }, + { + "epoch": 2.237883220964254, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.384321212768555, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8700851202011108, + "num_tokens": 671232288.0, + "step": 17592 + }, + { + "epoch": 2.2380104312428446, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49976921081543, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8558960556983948, + "num_tokens": 671275154.0, + "step": 17593 + }, + { + "epoch": 2.238137641521435, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.416269302368164, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8618457317352295, + "num_tokens": 671312266.0, + "step": 17594 + }, + { + "epoch": 2.238264851800025, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.583477020263672, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8726696372032166, + "num_tokens": 671349690.0, + "step": 17595 + }, + { + "epoch": 2.238392062078616, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.46283721923828, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8756213784217834, + "num_tokens": 671387898.0, + "step": 17596 + }, + { + "epoch": 2.2385192723572063, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.514768600463867, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8507446646690369, + "num_tokens": 671428054.0, + "step": 17597 + }, + { + "epoch": 2.238646482635797, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.483070373535156, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8788418173789978, + "num_tokens": 671467096.0, + "step": 17598 + }, + { + "epoch": 2.2387736929143873, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.36928367614746, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8711265325546265, + "num_tokens": 671506752.0, + "step": 17599 + }, + { + "epoch": 2.238900903192978, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60038185119629, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8651663064956665, + "num_tokens": 671542403.0, + "step": 17600 + }, + { + "epoch": 2.2390281134715684, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.445552825927734, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8829770088195801, + "num_tokens": 671578115.0, + "step": 17601 + }, + { + "epoch": 2.239155323750159, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.569856643676758, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8823099732398987, + "num_tokens": 671619854.0, + "step": 17602 + }, + { + "epoch": 2.2392825340287494, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.233673095703125, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8859925270080566, + "num_tokens": 671661617.0, + "step": 17603 + }, + { + "epoch": 2.23940974430734, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.366769790649414, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8613644242286682, + "num_tokens": 671702231.0, + "step": 17604 + }, + { + "epoch": 2.2395369545859305, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52545738220215, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8723957538604736, + "num_tokens": 671741483.0, + "step": 17605 + }, + { + "epoch": 2.239664164864521, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.29787826538086, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8678181171417236, + "num_tokens": 671780168.0, + "step": 17606 + }, + { + "epoch": 2.2397913751431116, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.671911239624023, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8499818444252014, + "num_tokens": 671818980.0, + "step": 17607 + }, + { + "epoch": 2.239918585421702, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.465173721313477, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8727592825889587, + "num_tokens": 671850010.0, + "step": 17608 + }, + { + "epoch": 2.2400457957002926, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78089714050293, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8652846813201904, + "num_tokens": 671888506.0, + "step": 17609 + }, + { + "epoch": 2.240173005978883, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.372526168823242, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8801601529121399, + "num_tokens": 671930965.0, + "step": 17610 + }, + { + "epoch": 2.2403002162574737, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.27726936340332, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8815309405326843, + "num_tokens": 671967472.0, + "step": 17611 + }, + { + "epoch": 2.240427426536064, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.933094024658203, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8935330510139465, + "num_tokens": 672005986.0, + "step": 17612 + }, + { + "epoch": 2.2405546368146547, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.2004337310791, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8731354475021362, + "num_tokens": 672044729.0, + "step": 17613 + }, + { + "epoch": 2.2406818470932452, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60033416748047, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8729925751686096, + "num_tokens": 672080141.0, + "step": 17614 + }, + { + "epoch": 2.2408090573718358, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63494300842285, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8805016279220581, + "num_tokens": 672119277.0, + "step": 17615 + }, + { + "epoch": 2.2409362676504263, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.614606857299805, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8707282543182373, + "num_tokens": 672154205.0, + "step": 17616 + }, + { + "epoch": 2.241063477929017, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417818069458008, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8717712759971619, + "num_tokens": 672193667.0, + "step": 17617 + }, + { + "epoch": 2.2411906882076074, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.403966903686523, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8427951335906982, + "num_tokens": 672232360.0, + "step": 17618 + }, + { + "epoch": 2.241317898486198, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.395030975341797, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8597251176834106, + "num_tokens": 672264127.0, + "step": 17619 + }, + { + "epoch": 2.241445108764788, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4306697845459, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8806737661361694, + "num_tokens": 672300719.0, + "step": 17620 + }, + { + "epoch": 2.2415723190433785, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50126075744629, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8826133608818054, + "num_tokens": 672342796.0, + "step": 17621 + }, + { + "epoch": 2.241699529321969, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45852279663086, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8745244145393372, + "num_tokens": 672382597.0, + "step": 17622 + }, + { + "epoch": 2.2418267396005596, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.512603759765625, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8613278269767761, + "num_tokens": 672427350.0, + "step": 17623 + }, + { + "epoch": 2.24195394987915, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.371755599975586, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8658742904663086, + "num_tokens": 672468997.0, + "step": 17624 + }, + { + "epoch": 2.2420811601577406, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.401906967163086, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8786786794662476, + "num_tokens": 672505061.0, + "step": 17625 + }, + { + "epoch": 2.242208370436331, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56938934326172, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8710805177688599, + "num_tokens": 672544241.0, + "step": 17626 + }, + { + "epoch": 2.2423355807149217, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.402666091918945, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8710891008377075, + "num_tokens": 672586334.0, + "step": 17627 + }, + { + "epoch": 2.242462790993512, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.487577438354492, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8680068254470825, + "num_tokens": 672628627.0, + "step": 17628 + }, + { + "epoch": 2.2425900012721027, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.435930252075195, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8624404668807983, + "num_tokens": 672663046.0, + "step": 17629 + }, + { + "epoch": 2.2427172115506933, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.401247024536133, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8786824941635132, + "num_tokens": 672703394.0, + "step": 17630 + }, + { + "epoch": 2.242844421829284, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6557559967041, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8628438115119934, + "num_tokens": 672743262.0, + "step": 17631 + }, + { + "epoch": 2.2429716321078743, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.500274658203125, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8387750387191772, + "num_tokens": 672782662.0, + "step": 17632 + }, + { + "epoch": 2.243098842386465, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.359745025634766, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8696376085281372, + "num_tokens": 672818400.0, + "step": 17633 + }, + { + "epoch": 2.2432260526650554, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.349987030029297, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8843358159065247, + "num_tokens": 672858843.0, + "step": 17634 + }, + { + "epoch": 2.243353262943646, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.464168548583984, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8731115460395813, + "num_tokens": 672900397.0, + "step": 17635 + }, + { + "epoch": 2.2434804732222364, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.362516403198242, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8698506951332092, + "num_tokens": 672936025.0, + "step": 17636 + }, + { + "epoch": 2.243607683500827, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.581829071044922, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8538565635681152, + "num_tokens": 672979180.0, + "step": 17637 + }, + { + "epoch": 2.2437348937794175, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49861717224121, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8824226260185242, + "num_tokens": 673018053.0, + "step": 17638 + }, + { + "epoch": 2.243862104058008, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.40389633178711, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8533071279525757, + "num_tokens": 673057596.0, + "step": 17639 + }, + { + "epoch": 2.2439893143365985, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.558589935302734, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.884149432182312, + "num_tokens": 673097740.0, + "step": 17640 + }, + { + "epoch": 2.244116524615189, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.578853607177734, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8776578903198242, + "num_tokens": 673141821.0, + "step": 17641 + }, + { + "epoch": 2.2442437348937796, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.504112243652344, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8798989057540894, + "num_tokens": 673184279.0, + "step": 17642 + }, + { + "epoch": 2.24437094517237, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54697608947754, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8818861246109009, + "num_tokens": 673224328.0, + "step": 17643 + }, + { + "epoch": 2.2444981554509607, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.470495223999023, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.85920250415802, + "num_tokens": 673263328.0, + "step": 17644 + }, + { + "epoch": 2.2446253657295507, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.51711654663086, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8766242265701294, + "num_tokens": 673301663.0, + "step": 17645 + }, + { + "epoch": 2.2447525760081413, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.513452529907227, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8698999285697937, + "num_tokens": 673340606.0, + "step": 17646 + }, + { + "epoch": 2.244879786286732, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.688417434692383, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.869318962097168, + "num_tokens": 673382201.0, + "step": 17647 + }, + { + "epoch": 2.2450069965653223, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43084144592285, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8812574148178101, + "num_tokens": 673423330.0, + "step": 17648 + }, + { + "epoch": 2.245134206843913, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.485898971557617, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8882861137390137, + "num_tokens": 673459690.0, + "step": 17649 + }, + { + "epoch": 2.2452614171225034, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.544677734375, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8677964806556702, + "num_tokens": 673494827.0, + "step": 17650 + }, + { + "epoch": 2.245388627401094, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.296171188354492, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8604934215545654, + "num_tokens": 673540568.0, + "step": 17651 + }, + { + "epoch": 2.2455158376796844, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.472288131713867, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8883534669876099, + "num_tokens": 673574022.0, + "step": 17652 + }, + { + "epoch": 2.245643047958275, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.494552612304688, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8689594268798828, + "num_tokens": 673612493.0, + "step": 17653 + }, + { + "epoch": 2.2457702582368655, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.466283798217773, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8676217794418335, + "num_tokens": 673653729.0, + "step": 17654 + }, + { + "epoch": 2.245897468515456, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.521177291870117, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8851603269577026, + "num_tokens": 673695626.0, + "step": 17655 + }, + { + "epoch": 2.2460246787940465, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.501209259033203, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.878551721572876, + "num_tokens": 673732188.0, + "step": 17656 + }, + { + "epoch": 2.246151889072637, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57732391357422, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8638929724693298, + "num_tokens": 673770259.0, + "step": 17657 + }, + { + "epoch": 2.2462790993512276, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39846420288086, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.868487536907196, + "num_tokens": 673805018.0, + "step": 17658 + }, + { + "epoch": 2.246406309629818, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.268850326538086, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8646575212478638, + "num_tokens": 673846058.0, + "step": 17659 + }, + { + "epoch": 2.2465335199084087, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.379648208618164, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8592664003372192, + "num_tokens": 673886437.0, + "step": 17660 + }, + { + "epoch": 2.246660730186999, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.573680877685547, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8757113814353943, + "num_tokens": 673925701.0, + "step": 17661 + }, + { + "epoch": 2.2467879404655897, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.386754989624023, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8742204904556274, + "num_tokens": 673969854.0, + "step": 17662 + }, + { + "epoch": 2.2469151507441802, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.499923706054688, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8744035959243774, + "num_tokens": 674005048.0, + "step": 17663 + }, + { + "epoch": 2.2470423610227708, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.16291618347168, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8672982454299927, + "num_tokens": 674046548.0, + "step": 17664 + }, + { + "epoch": 2.2471695713013613, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.601593017578125, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8664693832397461, + "num_tokens": 674086905.0, + "step": 17665 + }, + { + "epoch": 2.247296781579952, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.512983322143555, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8819636702537537, + "num_tokens": 674124589.0, + "step": 17666 + }, + { + "epoch": 2.2474239918585424, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.444368362426758, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8844300508499146, + "num_tokens": 674165135.0, + "step": 17667 + }, + { + "epoch": 2.247551202137133, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.485464096069336, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8666955232620239, + "num_tokens": 674207269.0, + "step": 17668 + }, + { + "epoch": 2.2476784124157234, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.422847747802734, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.888300895690918, + "num_tokens": 674240806.0, + "step": 17669 + }, + { + "epoch": 2.2478056226943135, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.366641998291016, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.863182544708252, + "num_tokens": 674274697.0, + "step": 17670 + }, + { + "epoch": 2.247932832972904, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77437400817871, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8631190061569214, + "num_tokens": 674311362.0, + "step": 17671 + }, + { + "epoch": 2.2480600432514946, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.662250518798828, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8790079951286316, + "num_tokens": 674352505.0, + "step": 17672 + }, + { + "epoch": 2.248187253530085, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55551528930664, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.876397967338562, + "num_tokens": 674383938.0, + "step": 17673 + }, + { + "epoch": 2.2483144638086756, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.59036636352539, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8690913319587708, + "num_tokens": 674421665.0, + "step": 17674 + }, + { + "epoch": 2.248441674087266, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60995864868164, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8742905855178833, + "num_tokens": 674466065.0, + "step": 17675 + }, + { + "epoch": 2.2485688843658567, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.335783004760742, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8740121126174927, + "num_tokens": 674506868.0, + "step": 17676 + }, + { + "epoch": 2.248696094644447, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.598358154296875, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8678151369094849, + "num_tokens": 674545756.0, + "step": 17677 + }, + { + "epoch": 2.2488233049230377, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.863344192504883, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8673050403594971, + "num_tokens": 674590680.0, + "step": 17678 + }, + { + "epoch": 2.2489505152016283, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.272077560424805, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.876652717590332, + "num_tokens": 674631624.0, + "step": 17679 + }, + { + "epoch": 2.249077725480219, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.349651336669922, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8672488331794739, + "num_tokens": 674667365.0, + "step": 17680 + }, + { + "epoch": 2.2492049357588093, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.380115509033203, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8718138933181763, + "num_tokens": 674706587.0, + "step": 17681 + }, + { + "epoch": 2.2493321460374, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.405004501342773, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8711497783660889, + "num_tokens": 674744480.0, + "step": 17682 + }, + { + "epoch": 2.2494593563159904, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.372129440307617, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8712085485458374, + "num_tokens": 674783104.0, + "step": 17683 + }, + { + "epoch": 2.249586566594581, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55515480041504, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8636127710342407, + "num_tokens": 674823488.0, + "step": 17684 + }, + { + "epoch": 2.2497137768731714, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.317459106445312, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8776487112045288, + "num_tokens": 674857322.0, + "step": 17685 + }, + { + "epoch": 2.249840987151762, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.571367263793945, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8633511662483215, + "num_tokens": 674901566.0, + "step": 17686 + }, + { + "epoch": 2.2499681974303525, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.678699493408203, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8776854276657104, + "num_tokens": 674942803.0, + "step": 17687 + }, + { + "epoch": 2.250095407708943, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.516990661621094, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8636586666107178, + "num_tokens": 674983160.0, + "step": 17688 + }, + { + "epoch": 2.2502226179875335, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.383197784423828, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8729103803634644, + "num_tokens": 675021243.0, + "step": 17689 + }, + { + "epoch": 2.250349828266124, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43150520324707, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8685476183891296, + "num_tokens": 675059435.0, + "step": 17690 + }, + { + "epoch": 2.2504770385447146, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.719863891601562, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8755894899368286, + "num_tokens": 675091689.0, + "step": 17691 + }, + { + "epoch": 2.250604248823305, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.643827438354492, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8662036657333374, + "num_tokens": 675130368.0, + "step": 17692 + }, + { + "epoch": 2.250731459101895, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.440837860107422, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8757321834564209, + "num_tokens": 675171745.0, + "step": 17693 + }, + { + "epoch": 2.250858669380486, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.407299041748047, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8707907199859619, + "num_tokens": 675206052.0, + "step": 17694 + }, + { + "epoch": 2.2509858796590763, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45305633544922, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8668361902236938, + "num_tokens": 675243231.0, + "step": 17695 + }, + { + "epoch": 2.251113089937667, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.402042388916016, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8648858666419983, + "num_tokens": 675273582.0, + "step": 17696 + }, + { + "epoch": 2.2512403002162573, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.424884796142578, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8690289258956909, + "num_tokens": 675314247.0, + "step": 17697 + }, + { + "epoch": 2.251367510494848, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.633275985717773, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8739453554153442, + "num_tokens": 675355675.0, + "step": 17698 + }, + { + "epoch": 2.2514947207734384, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70699691772461, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8699789047241211, + "num_tokens": 675394290.0, + "step": 17699 + }, + { + "epoch": 2.251621931052029, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.270204544067383, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8755291104316711, + "num_tokens": 675434251.0, + "step": 17700 + }, + { + "epoch": 2.2517491413306194, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.39967918395996, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8780760765075684, + "num_tokens": 675474731.0, + "step": 17701 + }, + { + "epoch": 2.25187635160921, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.676156997680664, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8664898872375488, + "num_tokens": 675515180.0, + "step": 17702 + }, + { + "epoch": 2.2520035618878005, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.282115936279297, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.862310528755188, + "num_tokens": 675553429.0, + "step": 17703 + }, + { + "epoch": 2.252130772166391, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.788026809692383, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8760251998901367, + "num_tokens": 675592919.0, + "step": 17704 + }, + { + "epoch": 2.2522579824449815, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6645565032959, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8795071244239807, + "num_tokens": 675627528.0, + "step": 17705 + }, + { + "epoch": 2.252385192723572, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.424348831176758, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8802896738052368, + "num_tokens": 675663721.0, + "step": 17706 + }, + { + "epoch": 2.2525124030021626, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.527050018310547, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.865516185760498, + "num_tokens": 675705187.0, + "step": 17707 + }, + { + "epoch": 2.252639613280753, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.637351989746094, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8812225461006165, + "num_tokens": 675752713.0, + "step": 17708 + }, + { + "epoch": 2.2527668235593437, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.468294143676758, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8727112412452698, + "num_tokens": 675791009.0, + "step": 17709 + }, + { + "epoch": 2.252894033837934, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52091407775879, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8747594952583313, + "num_tokens": 675828890.0, + "step": 17710 + }, + { + "epoch": 2.2530212441165247, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57740020751953, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8724629878997803, + "num_tokens": 675862855.0, + "step": 17711 + }, + { + "epoch": 2.2531484543951152, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.401498794555664, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8707097768783569, + "num_tokens": 675903032.0, + "step": 17712 + }, + { + "epoch": 2.2532756646737058, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.361711502075195, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8675814867019653, + "num_tokens": 675943035.0, + "step": 17713 + }, + { + "epoch": 2.2534028749522963, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.591623306274414, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8745342493057251, + "num_tokens": 675983073.0, + "step": 17714 + }, + { + "epoch": 2.253530085230887, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.445297241210938, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8770912885665894, + "num_tokens": 676022341.0, + "step": 17715 + }, + { + "epoch": 2.2536572955094774, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.490753173828125, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8692103624343872, + "num_tokens": 676060907.0, + "step": 17716 + }, + { + "epoch": 2.253784505788068, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.308324813842773, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.870417058467865, + "num_tokens": 676100693.0, + "step": 17717 + }, + { + "epoch": 2.253911716066658, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50351333618164, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8862776160240173, + "num_tokens": 676137526.0, + "step": 17718 + }, + { + "epoch": 2.254038926345249, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44552993774414, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.875259280204773, + "num_tokens": 676177245.0, + "step": 17719 + }, + { + "epoch": 2.254166136623839, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.646034240722656, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8789650797843933, + "num_tokens": 676214568.0, + "step": 17720 + }, + { + "epoch": 2.2542933469024296, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.399974822998047, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8734415769577026, + "num_tokens": 676252552.0, + "step": 17721 + }, + { + "epoch": 2.25442055718102, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.463565826416016, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8547452092170715, + "num_tokens": 676290480.0, + "step": 17722 + }, + { + "epoch": 2.2545477674596106, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.537330627441406, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.884834349155426, + "num_tokens": 676324950.0, + "step": 17723 + }, + { + "epoch": 2.254674977738201, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.455278396606445, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8648611307144165, + "num_tokens": 676363851.0, + "step": 17724 + }, + { + "epoch": 2.2548021880167917, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.331573486328125, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.881419837474823, + "num_tokens": 676399920.0, + "step": 17725 + }, + { + "epoch": 2.254929398295382, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.752338409423828, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8695728778839111, + "num_tokens": 676443243.0, + "step": 17726 + }, + { + "epoch": 2.2550566085739727, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.672317504882812, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8591925501823425, + "num_tokens": 676480272.0, + "step": 17727 + }, + { + "epoch": 2.2551838188525632, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.428054809570312, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8684083223342896, + "num_tokens": 676516630.0, + "step": 17728 + }, + { + "epoch": 2.2553110291311538, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.451353073120117, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8758111596107483, + "num_tokens": 676557842.0, + "step": 17729 + }, + { + "epoch": 2.2554382394097443, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.711118698120117, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.882387638092041, + "num_tokens": 676599413.0, + "step": 17730 + }, + { + "epoch": 2.255565449688335, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.512592315673828, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8646574020385742, + "num_tokens": 676637791.0, + "step": 17731 + }, + { + "epoch": 2.2556926599669254, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48614501953125, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8677502870559692, + "num_tokens": 676677108.0, + "step": 17732 + }, + { + "epoch": 2.255819870245516, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.416698455810547, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8741529583930969, + "num_tokens": 676720698.0, + "step": 17733 + }, + { + "epoch": 2.2559470805241064, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.453086853027344, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8610714673995972, + "num_tokens": 676763719.0, + "step": 17734 + }, + { + "epoch": 2.256074290802697, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.539745330810547, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8734877109527588, + "num_tokens": 676801083.0, + "step": 17735 + }, + { + "epoch": 2.2562015010812875, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67262077331543, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8646849393844604, + "num_tokens": 676840861.0, + "step": 17736 + }, + { + "epoch": 2.256328711359878, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41667938232422, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8848304748535156, + "num_tokens": 676876864.0, + "step": 17737 + }, + { + "epoch": 2.2564559216384685, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54962158203125, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8926467895507812, + "num_tokens": 676920983.0, + "step": 17738 + }, + { + "epoch": 2.256583131917059, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.393173217773438, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8822992444038391, + "num_tokens": 676961294.0, + "step": 17739 + }, + { + "epoch": 2.2567103421956496, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.598302841186523, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8652703166007996, + "num_tokens": 676996318.0, + "step": 17740 + }, + { + "epoch": 2.2568375524742397, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.671977996826172, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8746728897094727, + "num_tokens": 677034083.0, + "step": 17741 + }, + { + "epoch": 2.2569647627528306, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.604694366455078, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8732805848121643, + "num_tokens": 677071206.0, + "step": 17742 + }, + { + "epoch": 2.2570919730314207, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7725772857666, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8776907920837402, + "num_tokens": 677103573.0, + "step": 17743 + }, + { + "epoch": 2.2572191833100113, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.602203369140625, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8816961050033569, + "num_tokens": 677141406.0, + "step": 17744 + }, + { + "epoch": 2.257346393588602, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67648696899414, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8623775243759155, + "num_tokens": 677185968.0, + "step": 17745 + }, + { + "epoch": 2.2574736038671923, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.536731719970703, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8651630282402039, + "num_tokens": 677229004.0, + "step": 17746 + }, + { + "epoch": 2.257600814145783, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58377456665039, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8826846480369568, + "num_tokens": 677266317.0, + "step": 17747 + }, + { + "epoch": 2.2577280244243734, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.563108444213867, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8820838928222656, + "num_tokens": 677302344.0, + "step": 17748 + }, + { + "epoch": 2.257855234702964, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.638044357299805, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8632815480232239, + "num_tokens": 677342103.0, + "step": 17749 + }, + { + "epoch": 2.2579824449815544, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.581356048583984, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8596621751785278, + "num_tokens": 677378602.0, + "step": 17750 + }, + { + "epoch": 2.258109655260145, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.371723175048828, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8715709447860718, + "num_tokens": 677415005.0, + "step": 17751 + }, + { + "epoch": 2.2582368655387355, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.457992553710938, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8548411130905151, + "num_tokens": 677457908.0, + "step": 17752 + }, + { + "epoch": 2.258364075817326, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.551612854003906, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8624910116195679, + "num_tokens": 677492759.0, + "step": 17753 + }, + { + "epoch": 2.2584912860959165, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.476699829101562, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8703588843345642, + "num_tokens": 677535771.0, + "step": 17754 + }, + { + "epoch": 2.258618496374507, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.664146423339844, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8855664730072021, + "num_tokens": 677575325.0, + "step": 17755 + }, + { + "epoch": 2.2587457066530976, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.278894424438477, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.872897744178772, + "num_tokens": 677614132.0, + "step": 17756 + }, + { + "epoch": 2.258872916931688, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.870304107666016, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8673062324523926, + "num_tokens": 677645657.0, + "step": 17757 + }, + { + "epoch": 2.2590001272102787, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.371583938598633, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8803855180740356, + "num_tokens": 677686238.0, + "step": 17758 + }, + { + "epoch": 2.259127337488869, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.549205780029297, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8756480813026428, + "num_tokens": 677724080.0, + "step": 17759 + }, + { + "epoch": 2.2592545477674597, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.435901641845703, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8719840049743652, + "num_tokens": 677765768.0, + "step": 17760 + }, + { + "epoch": 2.2593817580460502, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.506385803222656, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8728712201118469, + "num_tokens": 677803937.0, + "step": 17761 + }, + { + "epoch": 2.2595089683246408, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.516408920288086, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8830808997154236, + "num_tokens": 677839685.0, + "step": 17762 + }, + { + "epoch": 2.2596361786032313, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.605907440185547, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.858427882194519, + "num_tokens": 677880411.0, + "step": 17763 + }, + { + "epoch": 2.259763388881822, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63397979736328, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8807908296585083, + "num_tokens": 677921177.0, + "step": 17764 + }, + { + "epoch": 2.2598905991604123, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.657209396362305, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8818938732147217, + "num_tokens": 677956393.0, + "step": 17765 + }, + { + "epoch": 2.2600178094390024, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.398624420166016, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8701465129852295, + "num_tokens": 677988364.0, + "step": 17766 + }, + { + "epoch": 2.2601450197175934, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.46150779724121, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8800203800201416, + "num_tokens": 678028093.0, + "step": 17767 + }, + { + "epoch": 2.2602722299961835, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.613643646240234, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8742750883102417, + "num_tokens": 678068174.0, + "step": 17768 + }, + { + "epoch": 2.260399440274774, + "ewc_loss": 0.03662109375, + "ewc_loss_parallel": 3.6716461181640625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.402639389038086, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8746238946914673, + "num_tokens": 678105784.0, + "step": 17769 + }, + { + "epoch": 2.2605266505533645, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41659164428711, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8829064965248108, + "num_tokens": 678142873.0, + "step": 17770 + }, + { + "epoch": 2.260653860831955, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48476219177246, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8692554235458374, + "num_tokens": 678181527.0, + "step": 17771 + }, + { + "epoch": 2.2607810711105456, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.389148712158203, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8761110305786133, + "num_tokens": 678221225.0, + "step": 17772 + }, + { + "epoch": 2.260908281389136, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.743671417236328, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8812795877456665, + "num_tokens": 678254767.0, + "step": 17773 + }, + { + "epoch": 2.2610354916677267, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.520627975463867, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8753761053085327, + "num_tokens": 678290830.0, + "step": 17774 + }, + { + "epoch": 2.261162701946317, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.543346405029297, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8838858008384705, + "num_tokens": 678327102.0, + "step": 17775 + }, + { + "epoch": 2.2612899122249077, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.37449073791504, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8857895135879517, + "num_tokens": 678364872.0, + "step": 17776 + }, + { + "epoch": 2.2614171225034982, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4058895111084, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8658003807067871, + "num_tokens": 678399233.0, + "step": 17777 + }, + { + "epoch": 2.2615443327820888, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.410140991210938, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8810632228851318, + "num_tokens": 678434577.0, + "step": 17778 + }, + { + "epoch": 2.2616715430606793, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.359140396118164, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8788460493087769, + "num_tokens": 678466835.0, + "step": 17779 + }, + { + "epoch": 2.26179875333927, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.608112335205078, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8878598213195801, + "num_tokens": 678506817.0, + "step": 17780 + }, + { + "epoch": 2.2619259636178604, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.309001922607422, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8736484050750732, + "num_tokens": 678543734.0, + "step": 17781 + }, + { + "epoch": 2.262053173896451, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.553783416748047, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8698505759239197, + "num_tokens": 678585066.0, + "step": 17782 + }, + { + "epoch": 2.2621803841750414, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49753761291504, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8867441415786743, + "num_tokens": 678614951.0, + "step": 17783 + }, + { + "epoch": 2.262307594453632, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.539873123168945, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8759858012199402, + "num_tokens": 678649671.0, + "step": 17784 + }, + { + "epoch": 2.2624348047322225, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.339181900024414, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8897282481193542, + "num_tokens": 678693418.0, + "step": 17785 + }, + { + "epoch": 2.262562015010813, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.346168518066406, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8832043409347534, + "num_tokens": 678730891.0, + "step": 17786 + }, + { + "epoch": 2.2626892252894035, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.258588790893555, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8819730281829834, + "num_tokens": 678766369.0, + "step": 17787 + }, + { + "epoch": 2.262816435567994, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.344192504882812, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8617980480194092, + "num_tokens": 678809118.0, + "step": 17788 + }, + { + "epoch": 2.2629436458465846, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.28041648864746, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8719671368598938, + "num_tokens": 678850057.0, + "step": 17789 + }, + { + "epoch": 2.263070856125175, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.414268493652344, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.87158203125, + "num_tokens": 678890253.0, + "step": 17790 + }, + { + "epoch": 2.263198066403765, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.426929473876953, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8879693746566772, + "num_tokens": 678922535.0, + "step": 17791 + }, + { + "epoch": 2.263325276682356, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47308349609375, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8716710209846497, + "num_tokens": 678956136.0, + "step": 17792 + }, + { + "epoch": 2.2634524869609463, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.430824279785156, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8771827220916748, + "num_tokens": 678990864.0, + "step": 17793 + }, + { + "epoch": 2.263579697239537, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.324033737182617, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8863368034362793, + "num_tokens": 679028345.0, + "step": 17794 + }, + { + "epoch": 2.2637069075181273, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.570941925048828, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8602670431137085, + "num_tokens": 679073679.0, + "step": 17795 + }, + { + "epoch": 2.263834117796718, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.409774780273438, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8713575601577759, + "num_tokens": 679109274.0, + "step": 17796 + }, + { + "epoch": 2.2639613280753084, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.332860946655273, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8745400309562683, + "num_tokens": 679141644.0, + "step": 17797 + }, + { + "epoch": 2.264088538353899, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.658952713012695, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8578201532363892, + "num_tokens": 679184351.0, + "step": 17798 + }, + { + "epoch": 2.2642157486324894, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.265914916992188, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8698478937149048, + "num_tokens": 679222509.0, + "step": 17799 + }, + { + "epoch": 2.26434295891108, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6057186126709, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.857990026473999, + "num_tokens": 679263362.0, + "step": 17800 + }, + { + "epoch": 2.2644701691896705, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.34835433959961, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8615741729736328, + "num_tokens": 679307859.0, + "step": 17801 + }, + { + "epoch": 2.264597379468261, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.401344299316406, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8837296366691589, + "num_tokens": 679344801.0, + "step": 17802 + }, + { + "epoch": 2.2647245897468515, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.301204681396484, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.866499125957489, + "num_tokens": 679382795.0, + "step": 17803 + }, + { + "epoch": 2.264851800025442, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.464216232299805, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8714531064033508, + "num_tokens": 679425282.0, + "step": 17804 + }, + { + "epoch": 2.2649790103040326, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.318166732788086, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8610112071037292, + "num_tokens": 679466881.0, + "step": 17805 + }, + { + "epoch": 2.265106220582623, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.31248664855957, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8849846720695496, + "num_tokens": 679509268.0, + "step": 17806 + }, + { + "epoch": 2.2652334308612136, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50271987915039, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8705240488052368, + "num_tokens": 679545082.0, + "step": 17807 + }, + { + "epoch": 2.265360641139804, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.620534896850586, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8754760026931763, + "num_tokens": 679581795.0, + "step": 17808 + }, + { + "epoch": 2.2654878514183947, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.597299575805664, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8716545701026917, + "num_tokens": 679618978.0, + "step": 17809 + }, + { + "epoch": 2.2656150616969852, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.402442932128906, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8769170045852661, + "num_tokens": 679655002.0, + "step": 17810 + }, + { + "epoch": 2.2657422719755758, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.51428985595703, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8876821994781494, + "num_tokens": 679691143.0, + "step": 17811 + }, + { + "epoch": 2.2658694822541663, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.583473205566406, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8752039670944214, + "num_tokens": 679729168.0, + "step": 17812 + }, + { + "epoch": 2.265996692532757, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.301916122436523, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.871511697769165, + "num_tokens": 679773385.0, + "step": 17813 + }, + { + "epoch": 2.2661239028113473, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.578601837158203, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8845309019088745, + "num_tokens": 679813884.0, + "step": 17814 + }, + { + "epoch": 2.266251113089938, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43185043334961, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8743249177932739, + "num_tokens": 679850834.0, + "step": 17815 + }, + { + "epoch": 2.266378323368528, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54425048828125, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8795294761657715, + "num_tokens": 679890210.0, + "step": 17816 + }, + { + "epoch": 2.266505533647119, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5759220123291, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8815511465072632, + "num_tokens": 679926252.0, + "step": 17817 + }, + { + "epoch": 2.266632743925709, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.380136489868164, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8705257177352905, + "num_tokens": 679959626.0, + "step": 17818 + }, + { + "epoch": 2.2667599542042995, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.397111892700195, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8683323860168457, + "num_tokens": 679999973.0, + "step": 17819 + }, + { + "epoch": 2.26688716448289, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.406261444091797, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8671779632568359, + "num_tokens": 680041081.0, + "step": 17820 + }, + { + "epoch": 2.2670143747614806, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.47998046875, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8782241344451904, + "num_tokens": 680078953.0, + "step": 17821 + }, + { + "epoch": 2.267141585040071, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.663162231445312, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8818821907043457, + "num_tokens": 680112309.0, + "step": 17822 + }, + { + "epoch": 2.2672687953186617, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.513500213623047, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8683531880378723, + "num_tokens": 680148607.0, + "step": 17823 + }, + { + "epoch": 2.267396005597252, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41619300842285, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8898458480834961, + "num_tokens": 680189617.0, + "step": 17824 + }, + { + "epoch": 2.2675232158758427, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.413881301879883, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8890404105186462, + "num_tokens": 680227940.0, + "step": 17825 + }, + { + "epoch": 2.2676504261544332, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.414533615112305, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8865297436714172, + "num_tokens": 680263473.0, + "step": 17826 + }, + { + "epoch": 2.2677776364330238, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54938507080078, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8655532598495483, + "num_tokens": 680297895.0, + "step": 17827 + }, + { + "epoch": 2.2679048467116143, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.252960205078125, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8759483695030212, + "num_tokens": 680337658.0, + "step": 17828 + }, + { + "epoch": 2.268032056990205, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.631916046142578, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.867924690246582, + "num_tokens": 680375912.0, + "step": 17829 + }, + { + "epoch": 2.2681592672687954, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49312400817871, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8850574493408203, + "num_tokens": 680415231.0, + "step": 17830 + }, + { + "epoch": 2.268286477547386, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.285118103027344, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8696386814117432, + "num_tokens": 680460037.0, + "step": 17831 + }, + { + "epoch": 2.2684136878259764, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.495412826538086, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8519946336746216, + "num_tokens": 680505521.0, + "step": 17832 + }, + { + "epoch": 2.268540898104567, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.3558406829834, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8867380619049072, + "num_tokens": 680547682.0, + "step": 17833 + }, + { + "epoch": 2.2686681083831575, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4487247467041, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8798404335975647, + "num_tokens": 680587510.0, + "step": 17834 + }, + { + "epoch": 2.268795318661748, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.548419952392578, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8774129152297974, + "num_tokens": 680624233.0, + "step": 17835 + }, + { + "epoch": 2.2689225289403385, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.518983840942383, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8692876696586609, + "num_tokens": 680666091.0, + "step": 17836 + }, + { + "epoch": 2.269049739218929, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.418212890625, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8800103068351746, + "num_tokens": 680703062.0, + "step": 17837 + }, + { + "epoch": 2.2691769494975196, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.581113815307617, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8865446448326111, + "num_tokens": 680735966.0, + "step": 17838 + }, + { + "epoch": 2.2693041597761097, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.430572509765625, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8741531372070312, + "num_tokens": 680780010.0, + "step": 17839 + }, + { + "epoch": 2.2694313700547006, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.540386199951172, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8730838298797607, + "num_tokens": 680819516.0, + "step": 17840 + }, + { + "epoch": 2.2695585803332907, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66518211364746, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8738687634468079, + "num_tokens": 680861374.0, + "step": 17841 + }, + { + "epoch": 2.2696857906118812, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.540512084960938, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8656977415084839, + "num_tokens": 680902447.0, + "step": 17842 + }, + { + "epoch": 2.2698130008904718, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.478736877441406, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8901493549346924, + "num_tokens": 680942376.0, + "step": 17843 + }, + { + "epoch": 2.2699402111690623, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.43808937072754, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8747919797897339, + "num_tokens": 680977060.0, + "step": 17844 + }, + { + "epoch": 2.270067421447653, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55329132080078, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8589250445365906, + "num_tokens": 681014504.0, + "step": 17845 + }, + { + "epoch": 2.2701946317262434, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.519105911254883, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8584011793136597, + "num_tokens": 681053919.0, + "step": 17846 + }, + { + "epoch": 2.270321842004834, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.551515579223633, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.874243974685669, + "num_tokens": 681090883.0, + "step": 17847 + }, + { + "epoch": 2.2704490522834244, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.461641311645508, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8879523277282715, + "num_tokens": 681128819.0, + "step": 17848 + }, + { + "epoch": 2.270576262562015, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.410030364990234, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8637986183166504, + "num_tokens": 681171434.0, + "step": 17849 + }, + { + "epoch": 2.2707034728406055, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.390668869018555, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8838704228401184, + "num_tokens": 681206297.0, + "step": 17850 + }, + { + "epoch": 2.270830683119196, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.703651428222656, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8651553988456726, + "num_tokens": 681242521.0, + "step": 17851 + }, + { + "epoch": 2.2709578933977865, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.474393844604492, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8835915327072144, + "num_tokens": 681282474.0, + "step": 17852 + }, + { + "epoch": 2.271085103676377, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.440061569213867, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8699944019317627, + "num_tokens": 681317280.0, + "step": 17853 + }, + { + "epoch": 2.2712123139549676, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.523792266845703, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8723076581954956, + "num_tokens": 681349155.0, + "step": 17854 + }, + { + "epoch": 2.271339524233558, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.563663482666016, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8743165731430054, + "num_tokens": 681380863.0, + "step": 17855 + }, + { + "epoch": 2.2714667345121486, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58566665649414, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8788365721702576, + "num_tokens": 681421970.0, + "step": 17856 + }, + { + "epoch": 2.271593944790739, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.335838317871094, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8884924650192261, + "num_tokens": 681453511.0, + "step": 17857 + }, + { + "epoch": 2.2717211550693297, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.477548599243164, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8615125417709351, + "num_tokens": 681496293.0, + "step": 17858 + }, + { + "epoch": 2.2718483653479202, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.333398818969727, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8667681217193604, + "num_tokens": 681537074.0, + "step": 17859 + }, + { + "epoch": 2.2719755756265108, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.35926055908203, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8671648502349854, + "num_tokens": 681578911.0, + "step": 17860 + }, + { + "epoch": 2.2721027859051013, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56816864013672, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8700094223022461, + "num_tokens": 681622134.0, + "step": 17861 + }, + { + "epoch": 2.272229996183692, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4127254486084, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8751524090766907, + "num_tokens": 681655919.0, + "step": 17862 + }, + { + "epoch": 2.2723572064622823, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86016845703125, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8656129837036133, + "num_tokens": 681696539.0, + "step": 17863 + }, + { + "epoch": 2.2724844167408724, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.316469192504883, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8717193603515625, + "num_tokens": 681734630.0, + "step": 17864 + }, + { + "epoch": 2.2726116270194634, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.596494674682617, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8621731400489807, + "num_tokens": 681779669.0, + "step": 17865 + }, + { + "epoch": 2.2727388372980535, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50064468383789, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.87189781665802, + "num_tokens": 681821920.0, + "step": 17866 + }, + { + "epoch": 2.272866047576644, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.494592666625977, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8720166683197021, + "num_tokens": 681859239.0, + "step": 17867 + }, + { + "epoch": 2.2729932578552345, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49413299560547, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8613647222518921, + "num_tokens": 681896569.0, + "step": 17868 + }, + { + "epoch": 2.273120468133825, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45304298400879, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8883088827133179, + "num_tokens": 681932549.0, + "step": 17869 + }, + { + "epoch": 2.2732476784124156, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.439899444580078, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8691844940185547, + "num_tokens": 681973981.0, + "step": 17870 + }, + { + "epoch": 2.273374888691006, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57699203491211, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8505429029464722, + "num_tokens": 682009512.0, + "step": 17871 + }, + { + "epoch": 2.2735020989695967, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41721534729004, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8710954189300537, + "num_tokens": 682048058.0, + "step": 17872 + }, + { + "epoch": 2.273629309248187, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.319753646850586, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8665596842765808, + "num_tokens": 682084181.0, + "step": 17873 + }, + { + "epoch": 2.2737565195267777, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.49872398376465, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8927263021469116, + "num_tokens": 682119613.0, + "step": 17874 + }, + { + "epoch": 2.2738837298053682, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.549169540405273, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8768740892410278, + "num_tokens": 682159501.0, + "step": 17875 + }, + { + "epoch": 2.2740109400839588, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.505996704101562, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.881693422794342, + "num_tokens": 682197323.0, + "step": 17876 + }, + { + "epoch": 2.2741381503625493, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48167610168457, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8754183053970337, + "num_tokens": 682231150.0, + "step": 17877 + }, + { + "epoch": 2.27426536064114, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75160789489746, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8753270506858826, + "num_tokens": 682269351.0, + "step": 17878 + }, + { + "epoch": 2.2743925709197303, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.428930282592773, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8850727081298828, + "num_tokens": 682303633.0, + "step": 17879 + }, + { + "epoch": 2.274519781198321, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.496915817260742, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8721920251846313, + "num_tokens": 682340890.0, + "step": 17880 + }, + { + "epoch": 2.2746469914769114, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.601383209228516, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8882595896720886, + "num_tokens": 682374253.0, + "step": 17881 + }, + { + "epoch": 2.274774201755502, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.502635955810547, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8736943006515503, + "num_tokens": 682412675.0, + "step": 17882 + }, + { + "epoch": 2.2749014120340925, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.552715301513672, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.884443998336792, + "num_tokens": 682447864.0, + "step": 17883 + }, + { + "epoch": 2.275028622312683, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.776777267456055, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8743983507156372, + "num_tokens": 682486445.0, + "step": 17884 + }, + { + "epoch": 2.2751558325912735, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.530044555664062, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8730412721633911, + "num_tokens": 682518173.0, + "step": 17885 + }, + { + "epoch": 2.275283042869864, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.413969039916992, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.872530996799469, + "num_tokens": 682558627.0, + "step": 17886 + }, + { + "epoch": 2.2754102531484546, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.543270111083984, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8781483173370361, + "num_tokens": 682600821.0, + "step": 17887 + }, + { + "epoch": 2.275537463427045, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.861051559448242, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8621990084648132, + "num_tokens": 682643947.0, + "step": 17888 + }, + { + "epoch": 2.275664673705635, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.526451110839844, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8686541318893433, + "num_tokens": 682683045.0, + "step": 17889 + }, + { + "epoch": 2.275791883984226, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.690290451049805, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8747985363006592, + "num_tokens": 682721617.0, + "step": 17890 + }, + { + "epoch": 2.2759190942628162, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60324478149414, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8678136467933655, + "num_tokens": 682758339.0, + "step": 17891 + }, + { + "epoch": 2.2760463045414068, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.33574867248535, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8789757490158081, + "num_tokens": 682800501.0, + "step": 17892 + }, + { + "epoch": 2.2761735148199973, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.712060928344727, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8751969337463379, + "num_tokens": 682833231.0, + "step": 17893 + }, + { + "epoch": 2.276300725098588, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50580596923828, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8790336847305298, + "num_tokens": 682873945.0, + "step": 17894 + }, + { + "epoch": 2.2764279353771784, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.353628158569336, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8718138337135315, + "num_tokens": 682918329.0, + "step": 17895 + }, + { + "epoch": 2.276555145655769, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.681396484375, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8797034621238708, + "num_tokens": 682956277.0, + "step": 17896 + }, + { + "epoch": 2.2766823559343594, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.435039520263672, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8774358630180359, + "num_tokens": 682990648.0, + "step": 17897 + }, + { + "epoch": 2.27680956621295, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.418485641479492, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8887221813201904, + "num_tokens": 683029046.0, + "step": 17898 + }, + { + "epoch": 2.2769367764915405, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77352523803711, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8742362260818481, + "num_tokens": 683064541.0, + "step": 17899 + }, + { + "epoch": 2.277063986770131, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.493755340576172, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8732618689537048, + "num_tokens": 683105015.0, + "step": 17900 + }, + { + "epoch": 2.2771911970487215, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.557392120361328, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8673614263534546, + "num_tokens": 683142938.0, + "step": 17901 + }, + { + "epoch": 2.277318407327312, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.566743850708008, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8874415159225464, + "num_tokens": 683181593.0, + "step": 17902 + }, + { + "epoch": 2.2774456176059026, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.335010528564453, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8628902435302734, + "num_tokens": 683218641.0, + "step": 17903 + }, + { + "epoch": 2.277572827884493, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.464670181274414, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.857990562915802, + "num_tokens": 683261479.0, + "step": 17904 + }, + { + "epoch": 2.2777000381630836, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.509401321411133, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8491010665893555, + "num_tokens": 683300602.0, + "step": 17905 + }, + { + "epoch": 2.277827248441674, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.379207611083984, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.872110903263092, + "num_tokens": 683337012.0, + "step": 17906 + }, + { + "epoch": 2.2779544587202647, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.509666442871094, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8806478977203369, + "num_tokens": 683368949.0, + "step": 17907 + }, + { + "epoch": 2.2780816689988552, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.495080947875977, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8713253140449524, + "num_tokens": 683408741.0, + "step": 17908 + }, + { + "epoch": 2.2782088792774458, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.436992645263672, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.863391637802124, + "num_tokens": 683447843.0, + "step": 17909 + }, + { + "epoch": 2.2783360895560363, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.385709762573242, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8703298568725586, + "num_tokens": 683489695.0, + "step": 17910 + }, + { + "epoch": 2.278463299834627, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45737648010254, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8710716962814331, + "num_tokens": 683528381.0, + "step": 17911 + }, + { + "epoch": 2.2785905101132173, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55560302734375, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8748376965522766, + "num_tokens": 683567261.0, + "step": 17912 + }, + { + "epoch": 2.278717720391808, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57866096496582, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8594995737075806, + "num_tokens": 683601083.0, + "step": 17913 + }, + { + "epoch": 2.278844930670398, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42456817626953, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8670065402984619, + "num_tokens": 683643494.0, + "step": 17914 + }, + { + "epoch": 2.278972140948989, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.641786575317383, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8697354793548584, + "num_tokens": 683684267.0, + "step": 17915 + }, + { + "epoch": 2.279099351227579, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.510114669799805, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8621119260787964, + "num_tokens": 683723292.0, + "step": 17916 + }, + { + "epoch": 2.2792265615061695, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.537384033203125, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8840984106063843, + "num_tokens": 683761378.0, + "step": 17917 + }, + { + "epoch": 2.27935377178476, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.696361541748047, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8659589290618896, + "num_tokens": 683794838.0, + "step": 17918 + }, + { + "epoch": 2.2794809820633506, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.505638122558594, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8548072576522827, + "num_tokens": 683832307.0, + "step": 17919 + }, + { + "epoch": 2.279608192341941, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50337028503418, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.866849422454834, + "num_tokens": 683871848.0, + "step": 17920 + }, + { + "epoch": 2.2797354026205316, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.545059204101562, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8761649131774902, + "num_tokens": 683915086.0, + "step": 17921 + }, + { + "epoch": 2.279862612899122, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.691877365112305, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8664463758468628, + "num_tokens": 683945411.0, + "step": 17922 + }, + { + "epoch": 2.2799898231777127, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45694923400879, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8732788562774658, + "num_tokens": 683979664.0, + "step": 17923 + }, + { + "epoch": 2.2801170334563032, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.429880142211914, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8633426427841187, + "num_tokens": 684025526.0, + "step": 17924 + }, + { + "epoch": 2.2802442437348938, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72972297668457, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.868197500705719, + "num_tokens": 684056992.0, + "step": 17925 + }, + { + "epoch": 2.2803714540134843, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.468894958496094, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8773149847984314, + "num_tokens": 684094515.0, + "step": 17926 + }, + { + "epoch": 2.280498664292075, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58902359008789, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8799068927764893, + "num_tokens": 684132294.0, + "step": 17927 + }, + { + "epoch": 2.2806258745706653, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.559696197509766, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8767693638801575, + "num_tokens": 684164010.0, + "step": 17928 + }, + { + "epoch": 2.280753084849256, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.423843383789062, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.880176305770874, + "num_tokens": 684202077.0, + "step": 17929 + }, + { + "epoch": 2.2808802951278464, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.734529495239258, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8463134765625, + "num_tokens": 684240799.0, + "step": 17930 + }, + { + "epoch": 2.281007505406437, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.42203140258789, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.870036244392395, + "num_tokens": 684284080.0, + "step": 17931 + }, + { + "epoch": 2.2811347156850275, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.666126251220703, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8561426401138306, + "num_tokens": 684317915.0, + "step": 17932 + }, + { + "epoch": 2.281261925963618, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4017333984375, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.9094933867454529, + "num_tokens": 684354665.0, + "step": 17933 + }, + { + "epoch": 2.2813891362422085, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.45044708251953, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8714916110038757, + "num_tokens": 684392866.0, + "step": 17934 + }, + { + "epoch": 2.281516346520799, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.51125717163086, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8766545653343201, + "num_tokens": 684428422.0, + "step": 17935 + }, + { + "epoch": 2.2816435567993896, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.388643264770508, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8597642183303833, + "num_tokens": 684467129.0, + "step": 17936 + }, + { + "epoch": 2.2817707670779797, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.618375778198242, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8663748502731323, + "num_tokens": 684500093.0, + "step": 17937 + }, + { + "epoch": 2.2818979773565706, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.697410583496094, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.863408088684082, + "num_tokens": 684541422.0, + "step": 17938 + }, + { + "epoch": 2.2820251876351607, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.260751724243164, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.869892418384552, + "num_tokens": 684579164.0, + "step": 17939 + }, + { + "epoch": 2.2821523979137512, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55943489074707, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8576213121414185, + "num_tokens": 684618617.0, + "step": 17940 + }, + { + "epoch": 2.2822796081923418, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.680442810058594, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.859504222869873, + "num_tokens": 684655525.0, + "step": 17941 + }, + { + "epoch": 2.2824068184709323, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.354976654052734, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8716592788696289, + "num_tokens": 684694525.0, + "step": 17942 + }, + { + "epoch": 2.282534028749523, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.652116775512695, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8728318214416504, + "num_tokens": 684733247.0, + "step": 17943 + }, + { + "epoch": 2.2826612390281134, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.471174240112305, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8861452341079712, + "num_tokens": 684763642.0, + "step": 17944 + }, + { + "epoch": 2.282788449306704, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.545927047729492, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8787902593612671, + "num_tokens": 684806164.0, + "step": 17945 + }, + { + "epoch": 2.2829156595852944, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.495718002319336, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8711949586868286, + "num_tokens": 684844193.0, + "step": 17946 + }, + { + "epoch": 2.283042869863885, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54733657836914, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8779306411743164, + "num_tokens": 684882103.0, + "step": 17947 + }, + { + "epoch": 2.2831700801424755, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.501773834228516, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8875499963760376, + "num_tokens": 684917667.0, + "step": 17948 + }, + { + "epoch": 2.283297290421066, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.342477798461914, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8772921562194824, + "num_tokens": 684954007.0, + "step": 17949 + }, + { + "epoch": 2.2834245006996565, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.51718521118164, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8677154779434204, + "num_tokens": 684994693.0, + "step": 17950 + }, + { + "epoch": 2.283551710978247, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.391820907592773, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8603639602661133, + "num_tokens": 685032713.0, + "step": 17951 + }, + { + "epoch": 2.2836789212568376, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.563568115234375, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8677811026573181, + "num_tokens": 685066597.0, + "step": 17952 + }, + { + "epoch": 2.283806131535428, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41843032836914, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8672729730606079, + "num_tokens": 685106243.0, + "step": 17953 + }, + { + "epoch": 2.2839333418140186, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.641271591186523, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8636322021484375, + "num_tokens": 685145177.0, + "step": 17954 + }, + { + "epoch": 2.284060552092609, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.499889373779297, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8793302178382874, + "num_tokens": 685189130.0, + "step": 17955 + }, + { + "epoch": 2.2841877623711997, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41754722595215, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8778022527694702, + "num_tokens": 685227536.0, + "step": 17956 + }, + { + "epoch": 2.28431497264979, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.756380081176758, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.877977192401886, + "num_tokens": 685267213.0, + "step": 17957 + }, + { + "epoch": 2.2844421829283807, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.465328216552734, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8702815175056458, + "num_tokens": 685301115.0, + "step": 17958 + }, + { + "epoch": 2.2845693932069713, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.571090698242188, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.87791907787323, + "num_tokens": 685338497.0, + "step": 17959 + }, + { + "epoch": 2.284696603485562, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.490812301635742, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8618977069854736, + "num_tokens": 685374983.0, + "step": 17960 + }, + { + "epoch": 2.2848238137641523, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.556474685668945, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8787351250648499, + "num_tokens": 685408859.0, + "step": 17961 + }, + { + "epoch": 2.2849510240427424, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.427364349365234, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.868421196937561, + "num_tokens": 685445103.0, + "step": 17962 + }, + { + "epoch": 2.2850782343213334, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.455856323242188, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8680579662322998, + "num_tokens": 685484657.0, + "step": 17963 + }, + { + "epoch": 2.2852054445999235, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.653955459594727, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8806225061416626, + "num_tokens": 685524034.0, + "step": 17964 + }, + { + "epoch": 2.285332654878514, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.594337463378906, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8779919743537903, + "num_tokens": 685560368.0, + "step": 17965 + }, + { + "epoch": 2.2854598651571045, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.573881149291992, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8687278032302856, + "num_tokens": 685598192.0, + "step": 17966 + }, + { + "epoch": 2.285587075435695, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.468231201171875, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8757349252700806, + "num_tokens": 685636300.0, + "step": 17967 + }, + { + "epoch": 2.2857142857142856, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58234214782715, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8705823421478271, + "num_tokens": 685671457.0, + "step": 17968 + }, + { + "epoch": 2.285841495992876, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.281930923461914, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8626459836959839, + "num_tokens": 685709687.0, + "step": 17969 + }, + { + "epoch": 2.2859687062714666, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55607795715332, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8508445024490356, + "num_tokens": 685747854.0, + "step": 17970 + }, + { + "epoch": 2.286095916550057, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64900779724121, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8921546339988708, + "num_tokens": 685785215.0, + "step": 17971 + }, + { + "epoch": 2.2862231268286477, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.655271530151367, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.877358078956604, + "num_tokens": 685818871.0, + "step": 17972 + }, + { + "epoch": 2.2863503371072382, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.655607223510742, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8649840950965881, + "num_tokens": 685859089.0, + "step": 17973 + }, + { + "epoch": 2.2864775473858288, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4863338470459, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8635739684104919, + "num_tokens": 685900346.0, + "step": 17974 + }, + { + "epoch": 2.2866047576644193, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.540517807006836, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8717496991157532, + "num_tokens": 685938186.0, + "step": 17975 + }, + { + "epoch": 2.28673196794301, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.516971588134766, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8785342574119568, + "num_tokens": 685974911.0, + "step": 17976 + }, + { + "epoch": 2.2868591782216003, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.851852416992188, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8748445510864258, + "num_tokens": 686010430.0, + "step": 17977 + }, + { + "epoch": 2.286986388500191, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.339466094970703, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8862513899803162, + "num_tokens": 686042727.0, + "step": 17978 + }, + { + "epoch": 2.2871135987787814, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.536876678466797, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8743144273757935, + "num_tokens": 686078819.0, + "step": 17979 + }, + { + "epoch": 2.287240809057372, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.682363510131836, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8746933341026306, + "num_tokens": 686117760.0, + "step": 17980 + }, + { + "epoch": 2.2873680193359625, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.475910186767578, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8629336357116699, + "num_tokens": 686152992.0, + "step": 17981 + }, + { + "epoch": 2.287495229614553, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.620433807373047, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8671606779098511, + "num_tokens": 686195353.0, + "step": 17982 + }, + { + "epoch": 2.2876224398931435, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.468034744262695, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8686268925666809, + "num_tokens": 686232990.0, + "step": 17983 + }, + { + "epoch": 2.287749650171734, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8529109954834, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8774834871292114, + "num_tokens": 686265933.0, + "step": 17984 + }, + { + "epoch": 2.2878768604503246, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.634998321533203, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8782954216003418, + "num_tokens": 686307111.0, + "step": 17985 + }, + { + "epoch": 2.288004070728915, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.680484771728516, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8744955658912659, + "num_tokens": 686343363.0, + "step": 17986 + }, + { + "epoch": 2.288131281007505, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79876708984375, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8715836405754089, + "num_tokens": 686380888.0, + "step": 17987 + }, + { + "epoch": 2.288258491286096, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86620330810547, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8636178374290466, + "num_tokens": 686417350.0, + "step": 17988 + }, + { + "epoch": 2.2883857015646862, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.645153045654297, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.880612313747406, + "num_tokens": 686458658.0, + "step": 17989 + }, + { + "epoch": 2.2885129118432768, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.621158599853516, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8761593103408813, + "num_tokens": 686493945.0, + "step": 17990 + }, + { + "epoch": 2.2886401221218673, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.472305297851562, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8725684285163879, + "num_tokens": 686527608.0, + "step": 17991 + }, + { + "epoch": 2.288767332400458, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.53693389892578, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8816359043121338, + "num_tokens": 686562272.0, + "step": 17992 + }, + { + "epoch": 2.2888945426790483, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.575729370117188, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8844467401504517, + "num_tokens": 686599937.0, + "step": 17993 + }, + { + "epoch": 2.289021752957639, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.767377853393555, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.87375408411026, + "num_tokens": 686632103.0, + "step": 17994 + }, + { + "epoch": 2.2891489632362294, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.68187713623047, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8778514862060547, + "num_tokens": 686662802.0, + "step": 17995 + }, + { + "epoch": 2.28927617351482, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.392040252685547, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8939085602760315, + "num_tokens": 686699386.0, + "step": 17996 + }, + { + "epoch": 2.2894033837934105, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.55162811279297, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.876605749130249, + "num_tokens": 686738369.0, + "step": 17997 + }, + { + "epoch": 2.289530594072001, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.783288955688477, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8661096692085266, + "num_tokens": 686774543.0, + "step": 17998 + }, + { + "epoch": 2.2896578043505915, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.456716537475586, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8651888370513916, + "num_tokens": 686807048.0, + "step": 17999 + }, + { + "epoch": 2.289785014629182, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.4114933013916, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8741738796234131, + "num_tokens": 686844678.0, + "step": 18000 + }, + { + "epoch": 2.2899122249077726, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.636423110961914, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8725057244300842, + "num_tokens": 686877721.0, + "step": 18001 + }, + { + "epoch": 2.290039435186363, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.68019676208496, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8665173649787903, + "num_tokens": 686918329.0, + "step": 18002 + }, + { + "epoch": 2.2901666454649536, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78764533996582, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8681837320327759, + "num_tokens": 686960050.0, + "step": 18003 + }, + { + "epoch": 2.290293855743544, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.554960250854492, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8831651210784912, + "num_tokens": 686997849.0, + "step": 18004 + }, + { + "epoch": 2.2904210660221347, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.41002082824707, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8762257695198059, + "num_tokens": 687032363.0, + "step": 18005 + }, + { + "epoch": 2.290548276300725, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.681177139282227, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8577431440353394, + "num_tokens": 687069606.0, + "step": 18006 + }, + { + "epoch": 2.2906754865793157, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.508420944213867, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8642537593841553, + "num_tokens": 687112857.0, + "step": 18007 + }, + { + "epoch": 2.2908026968579063, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.038040161132812, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8707399964332581, + "num_tokens": 687152245.0, + "step": 18008 + }, + { + "epoch": 2.290929907136497, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64828109741211, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8723709583282471, + "num_tokens": 687193988.0, + "step": 18009 + }, + { + "epoch": 2.2910571174150873, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.606416702270508, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8680678009986877, + "num_tokens": 687232375.0, + "step": 18010 + }, + { + "epoch": 2.291184327693678, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.599653244018555, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8769469261169434, + "num_tokens": 687274566.0, + "step": 18011 + }, + { + "epoch": 2.291311537972268, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417287826538086, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8722767233848572, + "num_tokens": 687313231.0, + "step": 18012 + }, + { + "epoch": 2.291438748250859, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85538673400879, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8770544528961182, + "num_tokens": 687354502.0, + "step": 18013 + }, + { + "epoch": 2.291565958529449, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.587228775024414, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8698301315307617, + "num_tokens": 687399160.0, + "step": 18014 + }, + { + "epoch": 2.2916931688080395, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54612922668457, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.863135576248169, + "num_tokens": 687438775.0, + "step": 18015 + }, + { + "epoch": 2.29182037908663, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.768386840820312, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8766393065452576, + "num_tokens": 687471435.0, + "step": 18016 + }, + { + "epoch": 2.2919475893652206, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.53364372253418, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8735209107398987, + "num_tokens": 687508657.0, + "step": 18017 + }, + { + "epoch": 2.292074799643811, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.475597381591797, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8827939033508301, + "num_tokens": 687545024.0, + "step": 18018 + }, + { + "epoch": 2.2922020099224016, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.021337509155273, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8769510388374329, + "num_tokens": 687579432.0, + "step": 18019 + }, + { + "epoch": 2.292329220200992, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.347482681274414, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8628021478652954, + "num_tokens": 687618378.0, + "step": 18020 + }, + { + "epoch": 2.2924564304795827, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70368194580078, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.871048092842102, + "num_tokens": 687657119.0, + "step": 18021 + }, + { + "epoch": 2.2925836407581732, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.562387466430664, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8621201515197754, + "num_tokens": 687691924.0, + "step": 18022 + }, + { + "epoch": 2.2927108510367638, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.540918350219727, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8499282598495483, + "num_tokens": 687724221.0, + "step": 18023 + }, + { + "epoch": 2.2928380613153543, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.854185104370117, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8746184706687927, + "num_tokens": 687765049.0, + "step": 18024 + }, + { + "epoch": 2.292965271593945, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78714942932129, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.863088846206665, + "num_tokens": 687810222.0, + "step": 18025 + }, + { + "epoch": 2.2930924818725353, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417871475219727, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.868043065071106, + "num_tokens": 687851961.0, + "step": 18026 + }, + { + "epoch": 2.293219692151126, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60208511352539, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.860649824142456, + "num_tokens": 687890448.0, + "step": 18027 + }, + { + "epoch": 2.2933469024297164, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58865737915039, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8821518421173096, + "num_tokens": 687929509.0, + "step": 18028 + }, + { + "epoch": 2.293474112708307, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.533782958984375, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8630579113960266, + "num_tokens": 687966814.0, + "step": 18029 + }, + { + "epoch": 2.2936013229868975, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.59058380126953, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8795757293701172, + "num_tokens": 688006398.0, + "step": 18030 + }, + { + "epoch": 2.293728533265488, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.626731872558594, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8708925843238831, + "num_tokens": 688044604.0, + "step": 18031 + }, + { + "epoch": 2.2938557435440785, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.44542121887207, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.86720210313797, + "num_tokens": 688082969.0, + "step": 18032 + }, + { + "epoch": 2.293982953822669, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.585575103759766, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8784838914871216, + "num_tokens": 688121442.0, + "step": 18033 + }, + { + "epoch": 2.2941101641012596, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.563398361206055, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8728472590446472, + "num_tokens": 688161096.0, + "step": 18034 + }, + { + "epoch": 2.2942373743798496, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.689655303955078, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8729021549224854, + "num_tokens": 688200782.0, + "step": 18035 + }, + { + "epoch": 2.2943645846584406, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.53421401977539, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8789080381393433, + "num_tokens": 688241333.0, + "step": 18036 + }, + { + "epoch": 2.2944917949370307, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.612003326416016, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.870165228843689, + "num_tokens": 688278297.0, + "step": 18037 + }, + { + "epoch": 2.2946190052156212, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.739667892456055, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8490803241729736, + "num_tokens": 688318835.0, + "step": 18038 + }, + { + "epoch": 2.2947462154942118, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.487346649169922, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8756157755851746, + "num_tokens": 688354814.0, + "step": 18039 + }, + { + "epoch": 2.2948734257728023, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69756507873535, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8749182224273682, + "num_tokens": 688390008.0, + "step": 18040 + }, + { + "epoch": 2.295000636051393, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.458965301513672, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8738842606544495, + "num_tokens": 688430938.0, + "step": 18041 + }, + { + "epoch": 2.2951278463299833, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.514997482299805, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8419762253761292, + "num_tokens": 688466730.0, + "step": 18042 + }, + { + "epoch": 2.295255056608574, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.746898651123047, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8710940480232239, + "num_tokens": 688506666.0, + "step": 18043 + }, + { + "epoch": 2.2953822668871644, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.478843688964844, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8582895994186401, + "num_tokens": 688547726.0, + "step": 18044 + }, + { + "epoch": 2.295509477165755, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.737401962280273, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8688496351242065, + "num_tokens": 688587487.0, + "step": 18045 + }, + { + "epoch": 2.2956366874443455, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.501358032226562, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8653348684310913, + "num_tokens": 688632454.0, + "step": 18046 + }, + { + "epoch": 2.295763897722936, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50663948059082, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8793396949768066, + "num_tokens": 688668489.0, + "step": 18047 + }, + { + "epoch": 2.2958911080015265, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03482437133789, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8748405575752258, + "num_tokens": 688707899.0, + "step": 18048 + }, + { + "epoch": 2.296018318280117, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.791210174560547, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8775986433029175, + "num_tokens": 688747171.0, + "step": 18049 + }, + { + "epoch": 2.2961455285587076, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.623151779174805, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8704235553741455, + "num_tokens": 688789553.0, + "step": 18050 + }, + { + "epoch": 2.296272738837298, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.012723922729492, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8765988349914551, + "num_tokens": 688827632.0, + "step": 18051 + }, + { + "epoch": 2.2963999491158886, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63873291015625, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8806922435760498, + "num_tokens": 688861505.0, + "step": 18052 + }, + { + "epoch": 2.296527159394479, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60198402404785, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8660900592803955, + "num_tokens": 688898270.0, + "step": 18053 + }, + { + "epoch": 2.2966543696730697, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.980724334716797, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8733833432197571, + "num_tokens": 688937420.0, + "step": 18054 + }, + { + "epoch": 2.29678157995166, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.763019561767578, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8784685134887695, + "num_tokens": 688972426.0, + "step": 18055 + }, + { + "epoch": 2.2969087902302507, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.686553955078125, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8725883960723877, + "num_tokens": 689005801.0, + "step": 18056 + }, + { + "epoch": 2.2970360005088413, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.633573532104492, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8679265975952148, + "num_tokens": 689039999.0, + "step": 18057 + }, + { + "epoch": 2.297163210787432, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57138442993164, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8570557832717896, + "num_tokens": 689079404.0, + "step": 18058 + }, + { + "epoch": 2.2972904210660223, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57823371887207, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8513559103012085, + "num_tokens": 689116197.0, + "step": 18059 + }, + { + "epoch": 2.2974176313446124, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.586240768432617, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.873031497001648, + "num_tokens": 689153446.0, + "step": 18060 + }, + { + "epoch": 2.2975448416232034, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.735652923583984, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8557729721069336, + "num_tokens": 689185310.0, + "step": 18061 + }, + { + "epoch": 2.2976720519017935, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.74260711669922, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8828506469726562, + "num_tokens": 689224150.0, + "step": 18062 + }, + { + "epoch": 2.297799262180384, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71927833557129, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8621684312820435, + "num_tokens": 689261206.0, + "step": 18063 + }, + { + "epoch": 2.2979264724589745, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.632585525512695, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8781248927116394, + "num_tokens": 689299129.0, + "step": 18064 + }, + { + "epoch": 2.298053682737565, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.782602310180664, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8881176710128784, + "num_tokens": 689330554.0, + "step": 18065 + }, + { + "epoch": 2.2981808930161556, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.795387268066406, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8785049915313721, + "num_tokens": 689364084.0, + "step": 18066 + }, + { + "epoch": 2.298308103294746, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.719127655029297, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8607118129730225, + "num_tokens": 689405254.0, + "step": 18067 + }, + { + "epoch": 2.2984353135733366, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.543535232543945, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8629363775253296, + "num_tokens": 689441109.0, + "step": 18068 + }, + { + "epoch": 2.298562523851927, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.703277587890625, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8726192712783813, + "num_tokens": 689474937.0, + "step": 18069 + }, + { + "epoch": 2.2986897341305177, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.50205421447754, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8675235509872437, + "num_tokens": 689515971.0, + "step": 18070 + }, + { + "epoch": 2.298816944409108, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70868492126465, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8506137132644653, + "num_tokens": 689553620.0, + "step": 18071 + }, + { + "epoch": 2.2989441546876987, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.576595306396484, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8759061098098755, + "num_tokens": 689592157.0, + "step": 18072 + }, + { + "epoch": 2.2990713649662893, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.788646697998047, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.885515570640564, + "num_tokens": 689626889.0, + "step": 18073 + }, + { + "epoch": 2.29919857524488, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6073055267334, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8840049505233765, + "num_tokens": 689666161.0, + "step": 18074 + }, + { + "epoch": 2.2993257855234703, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.578887939453125, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8646042346954346, + "num_tokens": 689713713.0, + "step": 18075 + }, + { + "epoch": 2.299452995802061, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.387971878051758, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8638508319854736, + "num_tokens": 689755146.0, + "step": 18076 + }, + { + "epoch": 2.2995802060806514, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70880126953125, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8806498646736145, + "num_tokens": 689795925.0, + "step": 18077 + }, + { + "epoch": 2.299707416359242, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.665630340576172, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8836473226547241, + "num_tokens": 689830673.0, + "step": 18078 + }, + { + "epoch": 2.2998346266378324, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63300895690918, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8716095685958862, + "num_tokens": 689863649.0, + "step": 18079 + }, + { + "epoch": 2.299961836916423, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67424964904785, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8712735176086426, + "num_tokens": 689901946.0, + "step": 18080 + }, + { + "epoch": 2.3000890471950135, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.497655868530273, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8661956191062927, + "num_tokens": 689939968.0, + "step": 18081 + }, + { + "epoch": 2.300216257473604, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.777999877929688, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8807013630867004, + "num_tokens": 689979792.0, + "step": 18082 + }, + { + "epoch": 2.3003434677521946, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7183780670166, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8851165771484375, + "num_tokens": 690013974.0, + "step": 18083 + }, + { + "epoch": 2.300470678030785, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.361251831054688, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8799176216125488, + "num_tokens": 690056540.0, + "step": 18084 + }, + { + "epoch": 2.300597888309375, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81717300415039, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8658211827278137, + "num_tokens": 690101032.0, + "step": 18085 + }, + { + "epoch": 2.300725098587966, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.729930877685547, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8843445777893066, + "num_tokens": 690135417.0, + "step": 18086 + }, + { + "epoch": 2.3008523088665562, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.427413940429688, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8734973073005676, + "num_tokens": 690178552.0, + "step": 18087 + }, + { + "epoch": 2.3009795191451468, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60533905029297, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8688415288925171, + "num_tokens": 690221928.0, + "step": 18088 + }, + { + "epoch": 2.3011067294237373, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.705230712890625, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8785744905471802, + "num_tokens": 690257040.0, + "step": 18089 + }, + { + "epoch": 2.301233939702328, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.583011627197266, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8713963031768799, + "num_tokens": 690302720.0, + "step": 18090 + }, + { + "epoch": 2.3013611499809183, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.720882415771484, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8669916391372681, + "num_tokens": 690337042.0, + "step": 18091 + }, + { + "epoch": 2.301488360259509, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71487045288086, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8778612613677979, + "num_tokens": 690380247.0, + "step": 18092 + }, + { + "epoch": 2.3016155705380994, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.407188415527344, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8760296106338501, + "num_tokens": 690425361.0, + "step": 18093 + }, + { + "epoch": 2.30174278081669, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.850873947143555, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8669053316116333, + "num_tokens": 690465137.0, + "step": 18094 + }, + { + "epoch": 2.3018699910952805, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6192684173584, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8706796169281006, + "num_tokens": 690501437.0, + "step": 18095 + }, + { + "epoch": 2.301997201373871, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56278419494629, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8676168918609619, + "num_tokens": 690544134.0, + "step": 18096 + }, + { + "epoch": 2.3021244116524615, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.686368942260742, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8816225528717041, + "num_tokens": 690581062.0, + "step": 18097 + }, + { + "epoch": 2.302251621931052, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.498382568359375, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8770315647125244, + "num_tokens": 690615145.0, + "step": 18098 + }, + { + "epoch": 2.3023788322096426, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.485206604003906, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8783990144729614, + "num_tokens": 690655483.0, + "step": 18099 + }, + { + "epoch": 2.302506042488233, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.507888793945312, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8698417544364929, + "num_tokens": 690694133.0, + "step": 18100 + }, + { + "epoch": 2.3026332527668236, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.514995574951172, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8510609269142151, + "num_tokens": 690729939.0, + "step": 18101 + }, + { + "epoch": 2.302760463045414, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6043643951416, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8786745071411133, + "num_tokens": 690770139.0, + "step": 18102 + }, + { + "epoch": 2.3028876733240047, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.423233032226562, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8740385174751282, + "num_tokens": 690811379.0, + "step": 18103 + }, + { + "epoch": 2.303014883602595, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.459522247314453, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8683403134346008, + "num_tokens": 690844194.0, + "step": 18104 + }, + { + "epoch": 2.3031420938811857, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.407806396484375, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.865699052810669, + "num_tokens": 690882129.0, + "step": 18105 + }, + { + "epoch": 2.3032693041597763, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58238410949707, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8864151239395142, + "num_tokens": 690915568.0, + "step": 18106 + }, + { + "epoch": 2.303396514438367, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.590858459472656, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.870478093624115, + "num_tokens": 690953412.0, + "step": 18107 + }, + { + "epoch": 2.3035237247169573, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.510013580322266, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8695821166038513, + "num_tokens": 690990047.0, + "step": 18108 + }, + { + "epoch": 2.303650934995548, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.570907592773438, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8770666122436523, + "num_tokens": 691031010.0, + "step": 18109 + }, + { + "epoch": 2.303778145274138, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.676162719726562, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8745584487915039, + "num_tokens": 691065830.0, + "step": 18110 + }, + { + "epoch": 2.303905355552729, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.749591827392578, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8840911388397217, + "num_tokens": 691101418.0, + "step": 18111 + }, + { + "epoch": 2.304032565831319, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.527738571166992, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8582639098167419, + "num_tokens": 691139447.0, + "step": 18112 + }, + { + "epoch": 2.3041597761099095, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.703899383544922, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8527989387512207, + "num_tokens": 691168809.0, + "step": 18113 + }, + { + "epoch": 2.3042869863885, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7667179107666, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.878497302532196, + "num_tokens": 691200420.0, + "step": 18114 + }, + { + "epoch": 2.3044141966670906, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.698596954345703, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8671289682388306, + "num_tokens": 691238587.0, + "step": 18115 + }, + { + "epoch": 2.304541406945681, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.752182006835938, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8704978823661804, + "num_tokens": 691277137.0, + "step": 18116 + }, + { + "epoch": 2.3046686172242716, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.693592071533203, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8714123964309692, + "num_tokens": 691319564.0, + "step": 18117 + }, + { + "epoch": 2.304795827502862, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.578285217285156, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8713489174842834, + "num_tokens": 691359652.0, + "step": 18118 + }, + { + "epoch": 2.3049230377814527, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.83773422241211, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8726001977920532, + "num_tokens": 691399705.0, + "step": 18119 + }, + { + "epoch": 2.305050248060043, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.633968353271484, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8657393455505371, + "num_tokens": 691439447.0, + "step": 18120 + }, + { + "epoch": 2.3051774583386337, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.597612380981445, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8606216311454773, + "num_tokens": 691477453.0, + "step": 18121 + }, + { + "epoch": 2.3053046686172243, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.754680633544922, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8719887137413025, + "num_tokens": 691517766.0, + "step": 18122 + }, + { + "epoch": 2.305431878895815, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5770263671875, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.872133731842041, + "num_tokens": 691553956.0, + "step": 18123 + }, + { + "epoch": 2.3055590891744053, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.550037384033203, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8700605630874634, + "num_tokens": 691592522.0, + "step": 18124 + }, + { + "epoch": 2.305686299452996, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85133171081543, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8713694214820862, + "num_tokens": 691629202.0, + "step": 18125 + }, + { + "epoch": 2.3058135097315864, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56270980834961, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8746706247329712, + "num_tokens": 691665324.0, + "step": 18126 + }, + { + "epoch": 2.305940720010177, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.697559356689453, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8820596933364868, + "num_tokens": 691704223.0, + "step": 18127 + }, + { + "epoch": 2.3060679302887674, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.650026321411133, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.868892252445221, + "num_tokens": 691745061.0, + "step": 18128 + }, + { + "epoch": 2.306195140567358, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69691276550293, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8841782212257385, + "num_tokens": 691783630.0, + "step": 18129 + }, + { + "epoch": 2.3063223508459485, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5975284576416, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8710181713104248, + "num_tokens": 691818283.0, + "step": 18130 + }, + { + "epoch": 2.306449561124539, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64179229736328, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8634392619132996, + "num_tokens": 691854585.0, + "step": 18131 + }, + { + "epoch": 2.3065767714031296, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67915916442871, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8549339175224304, + "num_tokens": 691891784.0, + "step": 18132 + }, + { + "epoch": 2.3067039816817196, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.417970657348633, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8736770153045654, + "num_tokens": 691927389.0, + "step": 18133 + }, + { + "epoch": 2.3068311919603106, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.772979736328125, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8696110248565674, + "num_tokens": 691962928.0, + "step": 18134 + }, + { + "epoch": 2.3069584022389007, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.614225387573242, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8716326355934143, + "num_tokens": 692007064.0, + "step": 18135 + }, + { + "epoch": 2.3070856125174912, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.470218658447266, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.875734806060791, + "num_tokens": 692040668.0, + "step": 18136 + }, + { + "epoch": 2.3072128227960818, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86093521118164, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.873566746711731, + "num_tokens": 692077710.0, + "step": 18137 + }, + { + "epoch": 2.3073400330746723, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.523202896118164, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8791850805282593, + "num_tokens": 692112316.0, + "step": 18138 + }, + { + "epoch": 2.307467243353263, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.676973342895508, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8768056035041809, + "num_tokens": 692153201.0, + "step": 18139 + }, + { + "epoch": 2.3075944536318533, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.965822219848633, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8658660650253296, + "num_tokens": 692192942.0, + "step": 18140 + }, + { + "epoch": 2.307721663910444, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.998567581176758, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8858448266983032, + "num_tokens": 692235515.0, + "step": 18141 + }, + { + "epoch": 2.3078488741890344, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.59524917602539, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8631441593170166, + "num_tokens": 692271338.0, + "step": 18142 + }, + { + "epoch": 2.307976084467625, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.677087783813477, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8536552786827087, + "num_tokens": 692311469.0, + "step": 18143 + }, + { + "epoch": 2.3081032947462155, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.885929107666016, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8549594879150391, + "num_tokens": 692352052.0, + "step": 18144 + }, + { + "epoch": 2.308230505024806, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.775609970092773, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8562576770782471, + "num_tokens": 692382978.0, + "step": 18145 + }, + { + "epoch": 2.3083577153033965, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56200408935547, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8746705055236816, + "num_tokens": 692415651.0, + "step": 18146 + }, + { + "epoch": 2.308484925581987, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70821762084961, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8539547920227051, + "num_tokens": 692455937.0, + "step": 18147 + }, + { + "epoch": 2.3086121358605776, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.842872619628906, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8665382862091064, + "num_tokens": 692496277.0, + "step": 18148 + }, + { + "epoch": 2.308739346139168, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.631591796875, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8725472092628479, + "num_tokens": 692530485.0, + "step": 18149 + }, + { + "epoch": 2.3088665564177586, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.712587356567383, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8684993386268616, + "num_tokens": 692564794.0, + "step": 18150 + }, + { + "epoch": 2.308993766696349, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.74929428100586, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8617197275161743, + "num_tokens": 692610359.0, + "step": 18151 + }, + { + "epoch": 2.3091209769749397, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.552621841430664, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8788108825683594, + "num_tokens": 692641560.0, + "step": 18152 + }, + { + "epoch": 2.30924818725353, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54673194885254, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.866997241973877, + "num_tokens": 692680114.0, + "step": 18153 + }, + { + "epoch": 2.3093753975321207, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.854076385498047, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8590289950370789, + "num_tokens": 692714199.0, + "step": 18154 + }, + { + "epoch": 2.3095026078107113, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.603878021240234, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8738564848899841, + "num_tokens": 692752331.0, + "step": 18155 + }, + { + "epoch": 2.309629818089302, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.498470306396484, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8779149055480957, + "num_tokens": 692789167.0, + "step": 18156 + }, + { + "epoch": 2.3097570283678923, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.054758071899414, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.871917724609375, + "num_tokens": 692829234.0, + "step": 18157 + }, + { + "epoch": 2.3098842386464824, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.933517456054688, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8511716723442078, + "num_tokens": 692863749.0, + "step": 18158 + }, + { + "epoch": 2.3100114489250734, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.447429656982422, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.867174506187439, + "num_tokens": 692900972.0, + "step": 18159 + }, + { + "epoch": 2.3101386592036635, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.740015029907227, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8657266497612, + "num_tokens": 692938522.0, + "step": 18160 + }, + { + "epoch": 2.310265869482254, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.638843536376953, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8809366226196289, + "num_tokens": 692973653.0, + "step": 18161 + }, + { + "epoch": 2.3103930797608445, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.590669631958008, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8643602728843689, + "num_tokens": 693016325.0, + "step": 18162 + }, + { + "epoch": 2.310520290039435, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.53121566772461, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8820058703422546, + "num_tokens": 693052022.0, + "step": 18163 + }, + { + "epoch": 2.3106475003180256, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56371307373047, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8762855529785156, + "num_tokens": 693093936.0, + "step": 18164 + }, + { + "epoch": 2.310774710596616, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02419090270996, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8849599361419678, + "num_tokens": 693130160.0, + "step": 18165 + }, + { + "epoch": 2.3109019208752066, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.462661743164062, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8744927644729614, + "num_tokens": 693172405.0, + "step": 18166 + }, + { + "epoch": 2.311029131153797, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.559967041015625, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8743830323219299, + "num_tokens": 693206499.0, + "step": 18167 + }, + { + "epoch": 2.3111563414323877, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.726966857910156, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8699873685836792, + "num_tokens": 693244287.0, + "step": 18168 + }, + { + "epoch": 2.311283551710978, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.544776916503906, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.862520158290863, + "num_tokens": 693286164.0, + "step": 18169 + }, + { + "epoch": 2.3114107619895687, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66293716430664, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8780632019042969, + "num_tokens": 693330961.0, + "step": 18170 + }, + { + "epoch": 2.3115379722681593, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66669464111328, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8764669895172119, + "num_tokens": 693375169.0, + "step": 18171 + }, + { + "epoch": 2.31166518254675, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.027843475341797, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8666085600852966, + "num_tokens": 693414127.0, + "step": 18172 + }, + { + "epoch": 2.3117923928253403, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.568096160888672, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8680495023727417, + "num_tokens": 693450446.0, + "step": 18173 + }, + { + "epoch": 2.311919603103931, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.667158126831055, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8628133535385132, + "num_tokens": 693490973.0, + "step": 18174 + }, + { + "epoch": 2.3120468133825214, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.508359909057617, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8732970356941223, + "num_tokens": 693526815.0, + "step": 18175 + }, + { + "epoch": 2.312174023661112, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.836328506469727, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8648161888122559, + "num_tokens": 693566949.0, + "step": 18176 + }, + { + "epoch": 2.3123012339397024, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52239990234375, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8656183481216431, + "num_tokens": 693601116.0, + "step": 18177 + }, + { + "epoch": 2.312428444218293, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7273006439209, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.9008774161338806, + "num_tokens": 693640588.0, + "step": 18178 + }, + { + "epoch": 2.3125556544968835, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.878780364990234, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8729230165481567, + "num_tokens": 693683352.0, + "step": 18179 + }, + { + "epoch": 2.312682864775474, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.652830123901367, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8776228427886963, + "num_tokens": 693722579.0, + "step": 18180 + }, + { + "epoch": 2.3128100750540646, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.698917388916016, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.873087465763092, + "num_tokens": 693766743.0, + "step": 18181 + }, + { + "epoch": 2.312937285332655, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.614789962768555, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8745536208152771, + "num_tokens": 693809699.0, + "step": 18182 + }, + { + "epoch": 2.313064495611245, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.630979537963867, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8780934810638428, + "num_tokens": 693845508.0, + "step": 18183 + }, + { + "epoch": 2.313191705889836, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.700828552246094, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8709896206855774, + "num_tokens": 693883763.0, + "step": 18184 + }, + { + "epoch": 2.313318916168426, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.48543930053711, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8775984048843384, + "num_tokens": 693917093.0, + "step": 18185 + }, + { + "epoch": 2.3134461264470167, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.029003143310547, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.872998833656311, + "num_tokens": 693949219.0, + "step": 18186 + }, + { + "epoch": 2.3135733367256073, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.044843673706055, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.865347146987915, + "num_tokens": 693992755.0, + "step": 18187 + }, + { + "epoch": 2.313700547004198, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.542343139648438, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8569996356964111, + "num_tokens": 694030386.0, + "step": 18188 + }, + { + "epoch": 2.3138277572827883, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.685134887695312, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8797869682312012, + "num_tokens": 694065019.0, + "step": 18189 + }, + { + "epoch": 2.313954967561379, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.00299835205078, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8692896366119385, + "num_tokens": 694106645.0, + "step": 18190 + }, + { + "epoch": 2.3140821778399694, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.886837005615234, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.875943124294281, + "num_tokens": 694149146.0, + "step": 18191 + }, + { + "epoch": 2.31420938811856, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.530534744262695, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8560391068458557, + "num_tokens": 694193567.0, + "step": 18192 + }, + { + "epoch": 2.3143365983971504, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.983888626098633, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8826209306716919, + "num_tokens": 694232160.0, + "step": 18193 + }, + { + "epoch": 2.314463808675741, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.013212203979492, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8720723986625671, + "num_tokens": 694275153.0, + "step": 18194 + }, + { + "epoch": 2.3145910189543315, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.808603286743164, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8778805732727051, + "num_tokens": 694313722.0, + "step": 18195 + }, + { + "epoch": 2.314718229232922, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56581687927246, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8748319149017334, + "num_tokens": 694350544.0, + "step": 18196 + }, + { + "epoch": 2.3148454395115126, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.816869735717773, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8743489384651184, + "num_tokens": 694386552.0, + "step": 18197 + }, + { + "epoch": 2.314972649790103, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.481687545776367, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8678606748580933, + "num_tokens": 694434368.0, + "step": 18198 + }, + { + "epoch": 2.3150998600686936, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.563173294067383, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8718836903572083, + "num_tokens": 694479856.0, + "step": 18199 + }, + { + "epoch": 2.315227070347284, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.607582092285156, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8724321126937866, + "num_tokens": 694521107.0, + "step": 18200 + }, + { + "epoch": 2.3153542806258747, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.521900177001953, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8730443120002747, + "num_tokens": 694560886.0, + "step": 18201 + }, + { + "epoch": 2.315481490904465, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.780534744262695, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8766864538192749, + "num_tokens": 694598404.0, + "step": 18202 + }, + { + "epoch": 2.3156087011830557, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54763412475586, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8666470050811768, + "num_tokens": 694637162.0, + "step": 18203 + }, + { + "epoch": 2.3157359114616463, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.765668869018555, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8752169609069824, + "num_tokens": 694677435.0, + "step": 18204 + }, + { + "epoch": 2.315863121740237, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.560171127319336, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.881134569644928, + "num_tokens": 694713980.0, + "step": 18205 + }, + { + "epoch": 2.3159903320188273, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.774677276611328, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.867135763168335, + "num_tokens": 694753884.0, + "step": 18206 + }, + { + "epoch": 2.316117542297418, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.797637939453125, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8845144510269165, + "num_tokens": 694785739.0, + "step": 18207 + }, + { + "epoch": 2.316244752576008, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.59402847290039, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8708265423774719, + "num_tokens": 694827629.0, + "step": 18208 + }, + { + "epoch": 2.316371962854599, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.915916442871094, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8677049875259399, + "num_tokens": 694868241.0, + "step": 18209 + }, + { + "epoch": 2.316499173133189, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.914541244506836, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8629071712493896, + "num_tokens": 694905298.0, + "step": 18210 + }, + { + "epoch": 2.3166263834117795, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.404191970825195, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8639016151428223, + "num_tokens": 694938054.0, + "step": 18211 + }, + { + "epoch": 2.31675359369037, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.874914169311523, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.867622435092926, + "num_tokens": 694978761.0, + "step": 18212 + }, + { + "epoch": 2.3168808039689606, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.982166290283203, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8683511018753052, + "num_tokens": 695015073.0, + "step": 18213 + }, + { + "epoch": 2.317008014247551, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70148468017578, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8665706515312195, + "num_tokens": 695052288.0, + "step": 18214 + }, + { + "epoch": 2.3171352245261416, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.607528686523438, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8605716228485107, + "num_tokens": 695088976.0, + "step": 18215 + }, + { + "epoch": 2.317262434804732, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.651611328125, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.875293493270874, + "num_tokens": 695128482.0, + "step": 18216 + }, + { + "epoch": 2.3173896450833227, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.856929779052734, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.868385910987854, + "num_tokens": 695164963.0, + "step": 18217 + }, + { + "epoch": 2.317516855361913, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58664321899414, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8932374715805054, + "num_tokens": 695201515.0, + "step": 18218 + }, + { + "epoch": 2.3176440656405037, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69256019592285, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8782490491867065, + "num_tokens": 695242057.0, + "step": 18219 + }, + { + "epoch": 2.3177712759190943, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.710660934448242, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8736541271209717, + "num_tokens": 695277128.0, + "step": 18220 + }, + { + "epoch": 2.317898486197685, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.160388946533203, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8842861652374268, + "num_tokens": 695311984.0, + "step": 18221 + }, + { + "epoch": 2.3180256964762753, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.773609161376953, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8617497682571411, + "num_tokens": 695348457.0, + "step": 18222 + }, + { + "epoch": 2.318152906754866, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.590736389160156, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8746801614761353, + "num_tokens": 695386518.0, + "step": 18223 + }, + { + "epoch": 2.3182801170334564, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.959993362426758, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.868868350982666, + "num_tokens": 695425365.0, + "step": 18224 + }, + { + "epoch": 2.318407327312047, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66070556640625, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8743064403533936, + "num_tokens": 695459559.0, + "step": 18225 + }, + { + "epoch": 2.3185345375906374, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.638031005859375, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8674458265304565, + "num_tokens": 695491386.0, + "step": 18226 + }, + { + "epoch": 2.318661747869228, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.411457061767578, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8550319075584412, + "num_tokens": 695534660.0, + "step": 18227 + }, + { + "epoch": 2.3187889581478185, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.812349319458008, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8635859489440918, + "num_tokens": 695578127.0, + "step": 18228 + }, + { + "epoch": 2.318916168426409, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.102134704589844, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8595374822616577, + "num_tokens": 695614181.0, + "step": 18229 + }, + { + "epoch": 2.3190433787049995, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5994815826416, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8713076710700989, + "num_tokens": 695652753.0, + "step": 18230 + }, + { + "epoch": 2.3191705889835896, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.5880184173584, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8788117170333862, + "num_tokens": 695692449.0, + "step": 18231 + }, + { + "epoch": 2.3192977992621806, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9088134765625, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8711346983909607, + "num_tokens": 695730812.0, + "step": 18232 + }, + { + "epoch": 2.3194250095407707, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.559358596801758, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8869333267211914, + "num_tokens": 695767577.0, + "step": 18233 + }, + { + "epoch": 2.319552219819361, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.848125457763672, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8730844855308533, + "num_tokens": 695812372.0, + "step": 18234 + }, + { + "epoch": 2.3196794300979517, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.665449142456055, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8877092599868774, + "num_tokens": 695847470.0, + "step": 18235 + }, + { + "epoch": 2.3198066403765423, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.52613639831543, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8871265649795532, + "num_tokens": 695888082.0, + "step": 18236 + }, + { + "epoch": 2.319933850655133, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.955841064453125, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8790812492370605, + "num_tokens": 695928904.0, + "step": 18237 + }, + { + "epoch": 2.3200610609337233, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91533851623535, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8618022799491882, + "num_tokens": 695967549.0, + "step": 18238 + }, + { + "epoch": 2.320188271212314, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.579586029052734, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8664991855621338, + "num_tokens": 696005611.0, + "step": 18239 + }, + { + "epoch": 2.3203154814909044, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73119354248047, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8746750950813293, + "num_tokens": 696034864.0, + "step": 18240 + }, + { + "epoch": 2.320442691769495, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.610185623168945, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8693599700927734, + "num_tokens": 696072137.0, + "step": 18241 + }, + { + "epoch": 2.3205699020480854, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.744823455810547, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8740944266319275, + "num_tokens": 696115744.0, + "step": 18242 + }, + { + "epoch": 2.320697112326676, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.642698287963867, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8674643635749817, + "num_tokens": 696151293.0, + "step": 18243 + }, + { + "epoch": 2.3208243226052665, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.771865844726562, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8893982768058777, + "num_tokens": 696190742.0, + "step": 18244 + }, + { + "epoch": 2.320951532883857, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.964427947998047, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8608490228652954, + "num_tokens": 696232807.0, + "step": 18245 + }, + { + "epoch": 2.3210787431624476, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.568878173828125, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8685243129730225, + "num_tokens": 696269416.0, + "step": 18246 + }, + { + "epoch": 2.321205953441038, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.656505584716797, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8726854920387268, + "num_tokens": 696305144.0, + "step": 18247 + }, + { + "epoch": 2.3213331637196286, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93535804748535, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8638754487037659, + "num_tokens": 696344854.0, + "step": 18248 + }, + { + "epoch": 2.321460373998219, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.536285400390625, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8901758193969727, + "num_tokens": 696380154.0, + "step": 18249 + }, + { + "epoch": 2.3215875842768097, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.61404037475586, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8594778776168823, + "num_tokens": 696416458.0, + "step": 18250 + }, + { + "epoch": 2.3217147945554, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91546630859375, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8747934699058533, + "num_tokens": 696453232.0, + "step": 18251 + }, + { + "epoch": 2.3218420048339907, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.53411102294922, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8512954115867615, + "num_tokens": 696487930.0, + "step": 18252 + }, + { + "epoch": 2.3219692151125813, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.689329147338867, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8622426986694336, + "num_tokens": 696523824.0, + "step": 18253 + }, + { + "epoch": 2.322096425391172, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.766035079956055, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8819814324378967, + "num_tokens": 696567007.0, + "step": 18254 + }, + { + "epoch": 2.3222236356697623, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.703636169433594, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8880113363265991, + "num_tokens": 696610055.0, + "step": 18255 + }, + { + "epoch": 2.3223508459483524, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57188606262207, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8576736450195312, + "num_tokens": 696644058.0, + "step": 18256 + }, + { + "epoch": 2.3224780562269434, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79400634765625, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8802233934402466, + "num_tokens": 696686381.0, + "step": 18257 + }, + { + "epoch": 2.3226052665055335, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.589599609375, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8665133714675903, + "num_tokens": 696727567.0, + "step": 18258 + }, + { + "epoch": 2.322732476784124, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9985294342041, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8614417910575867, + "num_tokens": 696767266.0, + "step": 18259 + }, + { + "epoch": 2.3228596870627145, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.65964698791504, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.884874165058136, + "num_tokens": 696806059.0, + "step": 18260 + }, + { + "epoch": 2.322986897341305, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.869661331176758, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8705824613571167, + "num_tokens": 696844232.0, + "step": 18261 + }, + { + "epoch": 2.3231141076198956, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.659095764160156, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8742031455039978, + "num_tokens": 696886223.0, + "step": 18262 + }, + { + "epoch": 2.323241317898486, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67371368408203, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8573823571205139, + "num_tokens": 696922625.0, + "step": 18263 + }, + { + "epoch": 2.3233685281770766, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.843456268310547, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8609321117401123, + "num_tokens": 696965160.0, + "step": 18264 + }, + { + "epoch": 2.323495738455667, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8076114654541, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8779274225234985, + "num_tokens": 697000854.0, + "step": 18265 + }, + { + "epoch": 2.3236229487342577, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.688854217529297, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8858903646469116, + "num_tokens": 697041264.0, + "step": 18266 + }, + { + "epoch": 2.323750159012848, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.664165496826172, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8696693181991577, + "num_tokens": 697080079.0, + "step": 18267 + }, + { + "epoch": 2.3238773692914387, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.51146697998047, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8676352500915527, + "num_tokens": 697123755.0, + "step": 18268 + }, + { + "epoch": 2.3240045795700293, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.897254943847656, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8438386917114258, + "num_tokens": 697162012.0, + "step": 18269 + }, + { + "epoch": 2.32413178984862, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.652332305908203, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8820856809616089, + "num_tokens": 697200188.0, + "step": 18270 + }, + { + "epoch": 2.3242590001272103, + "ewc_loss": 0.036865234375, + "ewc_loss_parallel": 3.695487976074219e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.815555572509766, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8746105432510376, + "num_tokens": 697240536.0, + "step": 18271 + }, + { + "epoch": 2.324386210405801, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.693002700805664, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8831682205200195, + "num_tokens": 697279459.0, + "step": 18272 + }, + { + "epoch": 2.3245134206843914, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84512710571289, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.888707160949707, + "num_tokens": 697312594.0, + "step": 18273 + }, + { + "epoch": 2.324640630962982, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.549978256225586, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8588141202926636, + "num_tokens": 697357834.0, + "step": 18274 + }, + { + "epoch": 2.3247678412415724, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88446807861328, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8706482648849487, + "num_tokens": 697401522.0, + "step": 18275 + }, + { + "epoch": 2.324895051520163, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.616472244262695, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8692464828491211, + "num_tokens": 697440415.0, + "step": 18276 + }, + { + "epoch": 2.3250222617987535, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.791147232055664, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8561896085739136, + "num_tokens": 697477609.0, + "step": 18277 + }, + { + "epoch": 2.325149472077344, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.707324981689453, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8681535124778748, + "num_tokens": 697515566.0, + "step": 18278 + }, + { + "epoch": 2.3252766823559345, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.477371215820312, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8762591481208801, + "num_tokens": 697549537.0, + "step": 18279 + }, + { + "epoch": 2.325403892634525, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72824478149414, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8758336305618286, + "num_tokens": 697582618.0, + "step": 18280 + }, + { + "epoch": 2.325531102913115, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.875234603881836, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.88551926612854, + "num_tokens": 697622234.0, + "step": 18281 + }, + { + "epoch": 2.325658313191706, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.489307403564453, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8642105460166931, + "num_tokens": 697657504.0, + "step": 18282 + }, + { + "epoch": 2.325785523470296, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.666275024414062, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8588048815727234, + "num_tokens": 697699496.0, + "step": 18283 + }, + { + "epoch": 2.3259127337488867, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.83568000793457, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8844757676124573, + "num_tokens": 697739969.0, + "step": 18284 + }, + { + "epoch": 2.3260399440274773, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.571422576904297, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8617597222328186, + "num_tokens": 697774346.0, + "step": 18285 + }, + { + "epoch": 2.326167154306068, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.65311050415039, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8577703237533569, + "num_tokens": 697813013.0, + "step": 18286 + }, + { + "epoch": 2.3262943645846583, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77558135986328, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8648720979690552, + "num_tokens": 697854163.0, + "step": 18287 + }, + { + "epoch": 2.326421574863249, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.527666091918945, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.879313588142395, + "num_tokens": 697898049.0, + "step": 18288 + }, + { + "epoch": 2.3265487851418394, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.726337432861328, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8751405477523804, + "num_tokens": 697936086.0, + "step": 18289 + }, + { + "epoch": 2.32667599542043, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.61933708190918, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8475246429443359, + "num_tokens": 697981529.0, + "step": 18290 + }, + { + "epoch": 2.3268032056990204, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.59459114074707, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8887729644775391, + "num_tokens": 698016047.0, + "step": 18291 + }, + { + "epoch": 2.326930415977611, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.742774963378906, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8835753202438354, + "num_tokens": 698052408.0, + "step": 18292 + }, + { + "epoch": 2.3270576262562015, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.784757614135742, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8654956221580505, + "num_tokens": 698089751.0, + "step": 18293 + }, + { + "epoch": 2.327184836534792, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.718732833862305, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8703895807266235, + "num_tokens": 698121274.0, + "step": 18294 + }, + { + "epoch": 2.3273120468133826, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.68523597717285, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8807669281959534, + "num_tokens": 698155880.0, + "step": 18295 + }, + { + "epoch": 2.327439257091973, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.725889205932617, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.873949408531189, + "num_tokens": 698191448.0, + "step": 18296 + }, + { + "epoch": 2.3275664673705636, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77006721496582, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8666017651557922, + "num_tokens": 698228437.0, + "step": 18297 + }, + { + "epoch": 2.327693677649154, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63874626159668, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8641831874847412, + "num_tokens": 698268714.0, + "step": 18298 + }, + { + "epoch": 2.3278208879277447, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8587589263916, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8724524974822998, + "num_tokens": 698305575.0, + "step": 18299 + }, + { + "epoch": 2.327948098206335, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.62332534790039, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8673925995826721, + "num_tokens": 698347701.0, + "step": 18300 + }, + { + "epoch": 2.3280753084849257, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.908004760742188, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8868727087974548, + "num_tokens": 698385906.0, + "step": 18301 + }, + { + "epoch": 2.3282025187635162, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.453149795532227, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8705102205276489, + "num_tokens": 698415511.0, + "step": 18302 + }, + { + "epoch": 2.3283297290421068, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.958683013916016, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8673262000083923, + "num_tokens": 698460528.0, + "step": 18303 + }, + { + "epoch": 2.3284569393206973, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79232406616211, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8574190139770508, + "num_tokens": 698500800.0, + "step": 18304 + }, + { + "epoch": 2.328584149599288, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.718421936035156, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8700367212295532, + "num_tokens": 698539373.0, + "step": 18305 + }, + { + "epoch": 2.328711359877878, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93781852722168, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8939909934997559, + "num_tokens": 698577067.0, + "step": 18306 + }, + { + "epoch": 2.328838570156469, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98202133178711, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8712544441223145, + "num_tokens": 698613515.0, + "step": 18307 + }, + { + "epoch": 2.328965780435059, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.858644485473633, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8731513023376465, + "num_tokens": 698647278.0, + "step": 18308 + }, + { + "epoch": 2.3290929907136495, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.839336395263672, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.878097414970398, + "num_tokens": 698688738.0, + "step": 18309 + }, + { + "epoch": 2.32922020099224, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.733312606811523, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.875545859336853, + "num_tokens": 698725003.0, + "step": 18310 + }, + { + "epoch": 2.3293474112708306, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.997581481933594, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8540163636207581, + "num_tokens": 698768205.0, + "step": 18311 + }, + { + "epoch": 2.329474621549421, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.543506622314453, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8524389266967773, + "num_tokens": 698805352.0, + "step": 18312 + }, + { + "epoch": 2.3296018318280116, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.955883026123047, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8808544278144836, + "num_tokens": 698836610.0, + "step": 18313 + }, + { + "epoch": 2.329729042106602, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.940004348754883, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8797513842582703, + "num_tokens": 698877514.0, + "step": 18314 + }, + { + "epoch": 2.3298562523851927, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.699951171875, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8748618960380554, + "num_tokens": 698916699.0, + "step": 18315 + }, + { + "epoch": 2.329983462663783, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.803621292114258, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8717597723007202, + "num_tokens": 698953568.0, + "step": 18316 + }, + { + "epoch": 2.3301106729423737, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.799463272094727, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8671938180923462, + "num_tokens": 698990562.0, + "step": 18317 + }, + { + "epoch": 2.3302378832209643, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63405990600586, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.873866081237793, + "num_tokens": 699024095.0, + "step": 18318 + }, + { + "epoch": 2.330365093499555, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.930665969848633, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8531444072723389, + "num_tokens": 699062327.0, + "step": 18319 + }, + { + "epoch": 2.3304923037781453, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.650259017944336, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8766289949417114, + "num_tokens": 699101591.0, + "step": 18320 + }, + { + "epoch": 2.330619514056736, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.828269958496094, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.863634467124939, + "num_tokens": 699134836.0, + "step": 18321 + }, + { + "epoch": 2.3307467243353264, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.020076751708984, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8629518747329712, + "num_tokens": 699171218.0, + "step": 18322 + }, + { + "epoch": 2.330873934613917, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79265785217285, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8890966176986694, + "num_tokens": 699200891.0, + "step": 18323 + }, + { + "epoch": 2.3310011448925074, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.883338928222656, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8873934745788574, + "num_tokens": 699238080.0, + "step": 18324 + }, + { + "epoch": 2.331128355171098, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.906156539916992, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8601053953170776, + "num_tokens": 699276051.0, + "step": 18325 + }, + { + "epoch": 2.3312555654496885, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60346221923828, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8786369562149048, + "num_tokens": 699315721.0, + "step": 18326 + }, + { + "epoch": 2.331382775728279, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.523815155029297, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8832113146781921, + "num_tokens": 699350342.0, + "step": 18327 + }, + { + "epoch": 2.3315099860068695, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.142038345336914, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.877890408039093, + "num_tokens": 699390301.0, + "step": 18328 + }, + { + "epoch": 2.3316371962854596, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.693103790283203, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8567264080047607, + "num_tokens": 699427135.0, + "step": 18329 + }, + { + "epoch": 2.3317644065640506, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6715145111084, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8775851130485535, + "num_tokens": 699466461.0, + "step": 18330 + }, + { + "epoch": 2.3318916168426407, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.778474807739258, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8650617599487305, + "num_tokens": 699502446.0, + "step": 18331 + }, + { + "epoch": 2.332018827121231, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.574777603149414, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8714218139648438, + "num_tokens": 699546357.0, + "step": 18332 + }, + { + "epoch": 2.3321460373998217, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77106475830078, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8717707395553589, + "num_tokens": 699581879.0, + "step": 18333 + }, + { + "epoch": 2.3322732476784123, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69415283203125, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8866300582885742, + "num_tokens": 699616287.0, + "step": 18334 + }, + { + "epoch": 2.332400457957003, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.568410873413086, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8712213635444641, + "num_tokens": 699655409.0, + "step": 18335 + }, + { + "epoch": 2.3325276682355933, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.771087646484375, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8459950685501099, + "num_tokens": 699701420.0, + "step": 18336 + }, + { + "epoch": 2.332654878514184, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.694896697998047, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8736385107040405, + "num_tokens": 699736151.0, + "step": 18337 + }, + { + "epoch": 2.3327820887927744, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73965072631836, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8648760914802551, + "num_tokens": 699770111.0, + "step": 18338 + }, + { + "epoch": 2.332909299071365, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02825164794922, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8682235479354858, + "num_tokens": 699807442.0, + "step": 18339 + }, + { + "epoch": 2.3330365093499554, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.615615844726562, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8690989017486572, + "num_tokens": 699849364.0, + "step": 18340 + }, + { + "epoch": 2.333163719628546, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.900793075561523, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8858949542045593, + "num_tokens": 699886234.0, + "step": 18341 + }, + { + "epoch": 2.3332909299071365, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.769906997680664, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8555280566215515, + "num_tokens": 699925660.0, + "step": 18342 + }, + { + "epoch": 2.333418140185727, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.665956497192383, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8444352149963379, + "num_tokens": 699967894.0, + "step": 18343 + }, + { + "epoch": 2.3335453504643175, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70067024230957, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8664531707763672, + "num_tokens": 699999211.0, + "step": 18344 + }, + { + "epoch": 2.333672560742908, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72146224975586, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8794727325439453, + "num_tokens": 700037639.0, + "step": 18345 + }, + { + "epoch": 2.3337997710214986, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.810810089111328, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8650014400482178, + "num_tokens": 700075677.0, + "step": 18346 + }, + { + "epoch": 2.333926981300089, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.800159454345703, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8781231641769409, + "num_tokens": 700115697.0, + "step": 18347 + }, + { + "epoch": 2.3340541915786797, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.712482452392578, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8544509410858154, + "num_tokens": 700152553.0, + "step": 18348 + }, + { + "epoch": 2.33418140185727, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.775390625, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8783339262008667, + "num_tokens": 700190958.0, + "step": 18349 + }, + { + "epoch": 2.3343086121358607, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.560745239257812, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8691299557685852, + "num_tokens": 700224244.0, + "step": 18350 + }, + { + "epoch": 2.3344358224144512, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.809051513671875, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8975287675857544, + "num_tokens": 700259736.0, + "step": 18351 + }, + { + "epoch": 2.3345630326930418, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.721078872680664, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8723158836364746, + "num_tokens": 700293624.0, + "step": 18352 + }, + { + "epoch": 2.3346902429716323, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.734771728515625, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8753713369369507, + "num_tokens": 700328052.0, + "step": 18353 + }, + { + "epoch": 2.3348174532502224, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.554378509521484, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8647611141204834, + "num_tokens": 700364652.0, + "step": 18354 + }, + { + "epoch": 2.3349446635288134, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.768823623657227, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.883766233921051, + "num_tokens": 700403698.0, + "step": 18355 + }, + { + "epoch": 2.3350718738074034, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.650371551513672, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8854851722717285, + "num_tokens": 700439391.0, + "step": 18356 + }, + { + "epoch": 2.335199084085994, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.586585998535156, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8748593330383301, + "num_tokens": 700475095.0, + "step": 18357 + }, + { + "epoch": 2.3353262943645845, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.711246490478516, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8763383030891418, + "num_tokens": 700514167.0, + "step": 18358 + }, + { + "epoch": 2.335453504643175, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6431827545166, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8814693093299866, + "num_tokens": 700548333.0, + "step": 18359 + }, + { + "epoch": 2.3355807149217656, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.492631912231445, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8769068717956543, + "num_tokens": 700594261.0, + "step": 18360 + }, + { + "epoch": 2.335707925200356, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.777099609375, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.877547562122345, + "num_tokens": 700632513.0, + "step": 18361 + }, + { + "epoch": 2.3358351354789466, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.674787521362305, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8781659603118896, + "num_tokens": 700672430.0, + "step": 18362 + }, + { + "epoch": 2.335962345757537, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57178497314453, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8777934312820435, + "num_tokens": 700714321.0, + "step": 18363 + }, + { + "epoch": 2.3360895560361277, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.762420654296875, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8721590042114258, + "num_tokens": 700751714.0, + "step": 18364 + }, + { + "epoch": 2.336216766314718, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.717727661132812, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.869274377822876, + "num_tokens": 700794087.0, + "step": 18365 + }, + { + "epoch": 2.3363439765933087, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.59520721435547, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8676600456237793, + "num_tokens": 700832031.0, + "step": 18366 + }, + { + "epoch": 2.3364711868718993, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66368293762207, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8749434947967529, + "num_tokens": 700867175.0, + "step": 18367 + }, + { + "epoch": 2.33659839715049, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.65354347229004, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8788673877716064, + "num_tokens": 700906762.0, + "step": 18368 + }, + { + "epoch": 2.3367256074290803, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78984260559082, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8745419979095459, + "num_tokens": 700944316.0, + "step": 18369 + }, + { + "epoch": 2.336852817707671, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.607206344604492, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8684648275375366, + "num_tokens": 700979369.0, + "step": 18370 + }, + { + "epoch": 2.3369800279862614, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.57225227355957, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8681787252426147, + "num_tokens": 701016213.0, + "step": 18371 + }, + { + "epoch": 2.337107238264852, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.625450134277344, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8702652454376221, + "num_tokens": 701055270.0, + "step": 18372 + }, + { + "epoch": 2.3372344485434424, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.82008934020996, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8703539371490479, + "num_tokens": 701091058.0, + "step": 18373 + }, + { + "epoch": 2.337361658822033, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.699493408203125, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8808360695838928, + "num_tokens": 701129446.0, + "step": 18374 + }, + { + "epoch": 2.3374888691006235, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.633487701416016, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8759338855743408, + "num_tokens": 701168726.0, + "step": 18375 + }, + { + "epoch": 2.337616079379214, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.865642547607422, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8655776977539062, + "num_tokens": 701204041.0, + "step": 18376 + }, + { + "epoch": 2.3377432896578045, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.723312377929688, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8504459261894226, + "num_tokens": 701244660.0, + "step": 18377 + }, + { + "epoch": 2.337870499936395, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66486930847168, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.864230215549469, + "num_tokens": 701284830.0, + "step": 18378 + }, + { + "epoch": 2.337997710214985, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.844467163085938, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8791360855102539, + "num_tokens": 701319720.0, + "step": 18379 + }, + { + "epoch": 2.338124920493576, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78723907470703, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8518495559692383, + "num_tokens": 701361770.0, + "step": 18380 + }, + { + "epoch": 2.338252130772166, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81209373474121, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8686574101448059, + "num_tokens": 701399777.0, + "step": 18381 + }, + { + "epoch": 2.3383793410507567, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.710329055786133, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8704594373703003, + "num_tokens": 701432680.0, + "step": 18382 + }, + { + "epoch": 2.3385065513293473, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.63138198852539, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8819694519042969, + "num_tokens": 701474101.0, + "step": 18383 + }, + { + "epoch": 2.338633761607938, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.707319259643555, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8849630355834961, + "num_tokens": 701517568.0, + "step": 18384 + }, + { + "epoch": 2.3387609718865283, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86667823791504, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8649560213088989, + "num_tokens": 701558466.0, + "step": 18385 + }, + { + "epoch": 2.338888182165119, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.691341400146484, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8672221899032593, + "num_tokens": 701591104.0, + "step": 18386 + }, + { + "epoch": 2.3390153924437094, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89674949645996, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8643366098403931, + "num_tokens": 701633242.0, + "step": 18387 + }, + { + "epoch": 2.3391426027223, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81214714050293, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8714274168014526, + "num_tokens": 701668864.0, + "step": 18388 + }, + { + "epoch": 2.3392698130008904, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.746490478515625, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8759173154830933, + "num_tokens": 701703648.0, + "step": 18389 + }, + { + "epoch": 2.339397023279481, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.541969299316406, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8681168556213379, + "num_tokens": 701744417.0, + "step": 18390 + }, + { + "epoch": 2.3395242335580715, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.783430099487305, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8405448794364929, + "num_tokens": 701783541.0, + "step": 18391 + }, + { + "epoch": 2.339651443836662, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.664045333862305, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8679817318916321, + "num_tokens": 701825054.0, + "step": 18392 + }, + { + "epoch": 2.3397786541152525, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.821073532104492, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8778517842292786, + "num_tokens": 701866173.0, + "step": 18393 + }, + { + "epoch": 2.339905864393843, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.085063934326172, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8681219816207886, + "num_tokens": 701904812.0, + "step": 18394 + }, + { + "epoch": 2.3400330746724336, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.587495803833008, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8622184991836548, + "num_tokens": 701942590.0, + "step": 18395 + }, + { + "epoch": 2.340160284951024, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.688091278076172, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.874906063079834, + "num_tokens": 701980773.0, + "step": 18396 + }, + { + "epoch": 2.3402874952296147, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.748294830322266, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8851145505905151, + "num_tokens": 702014508.0, + "step": 18397 + }, + { + "epoch": 2.340414705508205, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90156364440918, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8840574026107788, + "num_tokens": 702055754.0, + "step": 18398 + }, + { + "epoch": 2.3405419157867957, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.553125381469727, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8709914088249207, + "num_tokens": 702100022.0, + "step": 18399 + }, + { + "epoch": 2.3406691260653862, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.736251831054688, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.860575795173645, + "num_tokens": 702133217.0, + "step": 18400 + }, + { + "epoch": 2.3407963363439768, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.747241973876953, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8845292329788208, + "num_tokens": 702176998.0, + "step": 18401 + }, + { + "epoch": 2.3409235466225673, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70172119140625, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8777594566345215, + "num_tokens": 702216219.0, + "step": 18402 + }, + { + "epoch": 2.341050756901158, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.693775177001953, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8679352402687073, + "num_tokens": 702246502.0, + "step": 18403 + }, + { + "epoch": 2.341177967179748, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.739492416381836, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8788316249847412, + "num_tokens": 702277990.0, + "step": 18404 + }, + { + "epoch": 2.341305177458339, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76076316833496, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8749067783355713, + "num_tokens": 702312742.0, + "step": 18405 + }, + { + "epoch": 2.341432387736929, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.608816146850586, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8872069120407104, + "num_tokens": 702348575.0, + "step": 18406 + }, + { + "epoch": 2.3415595980155195, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.591026306152344, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8605177402496338, + "num_tokens": 702385956.0, + "step": 18407 + }, + { + "epoch": 2.34168680829411, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.535276412963867, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8769232630729675, + "num_tokens": 702423973.0, + "step": 18408 + }, + { + "epoch": 2.3418140185727006, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.585792541503906, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8685801029205322, + "num_tokens": 702455508.0, + "step": 18409 + }, + { + "epoch": 2.341941228851291, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.765413284301758, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8821249008178711, + "num_tokens": 702493365.0, + "step": 18410 + }, + { + "epoch": 2.3420684391298816, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.973464965820312, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.873924732208252, + "num_tokens": 702528036.0, + "step": 18411 + }, + { + "epoch": 2.342195649408472, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.552425384521484, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8665032982826233, + "num_tokens": 702571238.0, + "step": 18412 + }, + { + "epoch": 2.3423228596870627, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67885971069336, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8677029013633728, + "num_tokens": 702611247.0, + "step": 18413 + }, + { + "epoch": 2.342450069965653, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64090919494629, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.873581051826477, + "num_tokens": 702649661.0, + "step": 18414 + }, + { + "epoch": 2.3425772802442437, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.532814025878906, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8655561804771423, + "num_tokens": 702693573.0, + "step": 18415 + }, + { + "epoch": 2.3427044905228342, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.699068069458008, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8838127851486206, + "num_tokens": 702735351.0, + "step": 18416 + }, + { + "epoch": 2.3428317008014248, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.724224090576172, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8772879838943481, + "num_tokens": 702773406.0, + "step": 18417 + }, + { + "epoch": 2.3429589110800153, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.648372650146484, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8781790137290955, + "num_tokens": 702809939.0, + "step": 18418 + }, + { + "epoch": 2.343086121358606, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.62360954284668, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8694279789924622, + "num_tokens": 702851120.0, + "step": 18419 + }, + { + "epoch": 2.3432133316371964, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.682811737060547, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8809608221054077, + "num_tokens": 702889492.0, + "step": 18420 + }, + { + "epoch": 2.343340541915787, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.769123077392578, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8663537502288818, + "num_tokens": 702930486.0, + "step": 18421 + }, + { + "epoch": 2.3434677521943774, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58514404296875, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8632016181945801, + "num_tokens": 702969866.0, + "step": 18422 + }, + { + "epoch": 2.343594962472968, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.803556442260742, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.863724410533905, + "num_tokens": 703013703.0, + "step": 18423 + }, + { + "epoch": 2.3437221727515585, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.618213653564453, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8605194687843323, + "num_tokens": 703054849.0, + "step": 18424 + }, + { + "epoch": 2.343849383030149, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76946258544922, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8674294352531433, + "num_tokens": 703088250.0, + "step": 18425 + }, + { + "epoch": 2.3439765933087395, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.826448440551758, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8674743175506592, + "num_tokens": 703124072.0, + "step": 18426 + }, + { + "epoch": 2.3441038035873296, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.650808334350586, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8552465438842773, + "num_tokens": 703158869.0, + "step": 18427 + }, + { + "epoch": 2.3442310138659206, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.852203369140625, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8742462992668152, + "num_tokens": 703193342.0, + "step": 18428 + }, + { + "epoch": 2.3443582241445107, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.632320404052734, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8845894932746887, + "num_tokens": 703229899.0, + "step": 18429 + }, + { + "epoch": 2.344485434423101, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75667953491211, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8707241415977478, + "num_tokens": 703266735.0, + "step": 18430 + }, + { + "epoch": 2.3446126447016917, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.566810607910156, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8679580092430115, + "num_tokens": 703307515.0, + "step": 18431 + }, + { + "epoch": 2.3447398549802823, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88379669189453, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.882443904876709, + "num_tokens": 703341237.0, + "step": 18432 + }, + { + "epoch": 2.344867065258873, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.894926071166992, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8822927474975586, + "num_tokens": 703378900.0, + "step": 18433 + }, + { + "epoch": 2.3449942755374633, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.649232864379883, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8802323937416077, + "num_tokens": 703415260.0, + "step": 18434 + }, + { + "epoch": 2.345121485816054, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.714832305908203, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8771889209747314, + "num_tokens": 703452399.0, + "step": 18435 + }, + { + "epoch": 2.3452486960946444, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72516632080078, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8737189173698425, + "num_tokens": 703492700.0, + "step": 18436 + }, + { + "epoch": 2.345375906373235, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.771944046020508, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8842031955718994, + "num_tokens": 703530805.0, + "step": 18437 + }, + { + "epoch": 2.3455031166518254, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73303985595703, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8813745975494385, + "num_tokens": 703571894.0, + "step": 18438 + }, + { + "epoch": 2.345630326930416, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78718376159668, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8775589466094971, + "num_tokens": 703612583.0, + "step": 18439 + }, + { + "epoch": 2.3457575372090065, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.575637817382812, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8664437532424927, + "num_tokens": 703649100.0, + "step": 18440 + }, + { + "epoch": 2.345884747487597, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.020414352416992, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8876991271972656, + "num_tokens": 703680425.0, + "step": 18441 + }, + { + "epoch": 2.3460119577661875, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87134552001953, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8789984583854675, + "num_tokens": 703721819.0, + "step": 18442 + }, + { + "epoch": 2.346139168044778, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.538196563720703, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8688963055610657, + "num_tokens": 703755432.0, + "step": 18443 + }, + { + "epoch": 2.3462663783233686, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8619384765625, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8712915778160095, + "num_tokens": 703795903.0, + "step": 18444 + }, + { + "epoch": 2.346393588601959, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94917106628418, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8828926682472229, + "num_tokens": 703836074.0, + "step": 18445 + }, + { + "epoch": 2.3465207988805497, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.58257293701172, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8769832849502563, + "num_tokens": 703870609.0, + "step": 18446 + }, + { + "epoch": 2.34664800915914, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78574562072754, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.877087414264679, + "num_tokens": 703909034.0, + "step": 18447 + }, + { + "epoch": 2.3467752194377307, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.798171997070312, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8652268648147583, + "num_tokens": 703946976.0, + "step": 18448 + }, + { + "epoch": 2.3469024297163212, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.678434371948242, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8749759793281555, + "num_tokens": 703986379.0, + "step": 18449 + }, + { + "epoch": 2.3470296399949118, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.735496520996094, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8912906646728516, + "num_tokens": 704019461.0, + "step": 18450 + }, + { + "epoch": 2.3471568502735023, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.766130447387695, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8584039807319641, + "num_tokens": 704053411.0, + "step": 18451 + }, + { + "epoch": 2.3472840605520924, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71707534790039, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.877521276473999, + "num_tokens": 704092633.0, + "step": 18452 + }, + { + "epoch": 2.3474112708306833, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.835302352905273, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8764247298240662, + "num_tokens": 704126955.0, + "step": 18453 + }, + { + "epoch": 2.3475384811092734, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.902729034423828, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8889384269714355, + "num_tokens": 704162222.0, + "step": 18454 + }, + { + "epoch": 2.347665691387864, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.758689880371094, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8722739815711975, + "num_tokens": 704198949.0, + "step": 18455 + }, + { + "epoch": 2.3477929016664545, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.925519943237305, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8678290843963623, + "num_tokens": 704235327.0, + "step": 18456 + }, + { + "epoch": 2.347920111945045, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.635683059692383, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8786514401435852, + "num_tokens": 704272783.0, + "step": 18457 + }, + { + "epoch": 2.3480473222236355, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.775875091552734, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8559664487838745, + "num_tokens": 704314396.0, + "step": 18458 + }, + { + "epoch": 2.348174532502226, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.80006980895996, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8889902234077454, + "num_tokens": 704346903.0, + "step": 18459 + }, + { + "epoch": 2.3483017427808166, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.839797973632812, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8803837299346924, + "num_tokens": 704380456.0, + "step": 18460 + }, + { + "epoch": 2.348428953059407, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.691530227661133, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8845202922821045, + "num_tokens": 704416403.0, + "step": 18461 + }, + { + "epoch": 2.3485561633379977, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.835256576538086, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8743069767951965, + "num_tokens": 704460431.0, + "step": 18462 + }, + { + "epoch": 2.348683373616588, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.875295639038086, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8812774419784546, + "num_tokens": 704497872.0, + "step": 18463 + }, + { + "epoch": 2.3488105838951787, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.841503143310547, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8744608163833618, + "num_tokens": 704538180.0, + "step": 18464 + }, + { + "epoch": 2.3489377941737692, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.760278701782227, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8845828771591187, + "num_tokens": 704578342.0, + "step": 18465 + }, + { + "epoch": 2.3490650044523598, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.844131469726562, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8847870230674744, + "num_tokens": 704611792.0, + "step": 18466 + }, + { + "epoch": 2.3491922147309503, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.826278686523438, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8758933544158936, + "num_tokens": 704649898.0, + "step": 18467 + }, + { + "epoch": 2.349319425009541, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.708860397338867, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8812780380249023, + "num_tokens": 704690632.0, + "step": 18468 + }, + { + "epoch": 2.3494466352881314, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.80295753479004, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8631101846694946, + "num_tokens": 704722955.0, + "step": 18469 + }, + { + "epoch": 2.349573845566722, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.881893157958984, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8723217844963074, + "num_tokens": 704762678.0, + "step": 18470 + }, + { + "epoch": 2.3497010558453124, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.718847274780273, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8976703882217407, + "num_tokens": 704805440.0, + "step": 18471 + }, + { + "epoch": 2.349828266123903, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.722227096557617, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8549636602401733, + "num_tokens": 704848153.0, + "step": 18472 + }, + { + "epoch": 2.3499554764024935, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79650115966797, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8710461854934692, + "num_tokens": 704886739.0, + "step": 18473 + }, + { + "epoch": 2.350082686681084, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.704130172729492, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8718526363372803, + "num_tokens": 704926145.0, + "step": 18474 + }, + { + "epoch": 2.3502098969596745, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60983657836914, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.857835054397583, + "num_tokens": 704964135.0, + "step": 18475 + }, + { + "epoch": 2.350337107238265, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.952632904052734, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8613772392272949, + "num_tokens": 705002171.0, + "step": 18476 + }, + { + "epoch": 2.350464317516855, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87327003479004, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8925173282623291, + "num_tokens": 705040660.0, + "step": 18477 + }, + { + "epoch": 2.350591527795446, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.728151321411133, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8786122798919678, + "num_tokens": 705073655.0, + "step": 18478 + }, + { + "epoch": 2.350718738074036, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.830081939697266, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8706129193305969, + "num_tokens": 705115497.0, + "step": 18479 + }, + { + "epoch": 2.3508459483526267, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.029817581176758, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8901709318161011, + "num_tokens": 705152642.0, + "step": 18480 + }, + { + "epoch": 2.3509731586312173, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92003631591797, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8588275909423828, + "num_tokens": 705187282.0, + "step": 18481 + }, + { + "epoch": 2.351100368909808, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.003145217895508, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8767432570457458, + "num_tokens": 705225635.0, + "step": 18482 + }, + { + "epoch": 2.3512275791883983, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.723295211791992, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8832042217254639, + "num_tokens": 705260376.0, + "step": 18483 + }, + { + "epoch": 2.351354789466989, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.767061233520508, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8718020915985107, + "num_tokens": 705298799.0, + "step": 18484 + }, + { + "epoch": 2.3514819997455794, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84429931640625, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8555468320846558, + "num_tokens": 705332570.0, + "step": 18485 + }, + { + "epoch": 2.35160921002417, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.607940673828125, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8753083944320679, + "num_tokens": 705371214.0, + "step": 18486 + }, + { + "epoch": 2.3517364203027604, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7747802734375, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8705612421035767, + "num_tokens": 705405051.0, + "step": 18487 + }, + { + "epoch": 2.351863630581351, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.74656105041504, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8723508715629578, + "num_tokens": 705442460.0, + "step": 18488 + }, + { + "epoch": 2.3519908408599415, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.746522903442383, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8727363348007202, + "num_tokens": 705482929.0, + "step": 18489 + }, + { + "epoch": 2.352118051138532, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.761404037475586, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8684746026992798, + "num_tokens": 705525313.0, + "step": 18490 + }, + { + "epoch": 2.3522452614171225, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71564292907715, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8679008483886719, + "num_tokens": 705563730.0, + "step": 18491 + }, + { + "epoch": 2.352372471695713, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.618383407592773, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8774271011352539, + "num_tokens": 705602459.0, + "step": 18492 + }, + { + "epoch": 2.3524996819743036, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.813467025756836, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8699181079864502, + "num_tokens": 705636223.0, + "step": 18493 + }, + { + "epoch": 2.352626892252894, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6153621673584, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8719192743301392, + "num_tokens": 705676178.0, + "step": 18494 + }, + { + "epoch": 2.3527541025314846, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72303009033203, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.87926185131073, + "num_tokens": 705716687.0, + "step": 18495 + }, + { + "epoch": 2.352881312810075, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8232479095459, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8810771107673645, + "num_tokens": 705759163.0, + "step": 18496 + }, + { + "epoch": 2.3530085230886657, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.707107543945312, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8750807046890259, + "num_tokens": 705795982.0, + "step": 18497 + }, + { + "epoch": 2.3531357333672562, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.646976470947266, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8769577741622925, + "num_tokens": 705835601.0, + "step": 18498 + }, + { + "epoch": 2.3532629436458468, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70121955871582, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8712527751922607, + "num_tokens": 705876858.0, + "step": 18499 + }, + { + "epoch": 2.353390153924437, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87080192565918, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8651875257492065, + "num_tokens": 705921944.0, + "step": 18500 + }, + { + "epoch": 2.353517364203028, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.559782028198242, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8733801245689392, + "num_tokens": 705967999.0, + "step": 18501 + }, + { + "epoch": 2.353644574481618, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.798891067504883, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8652044534683228, + "num_tokens": 706003370.0, + "step": 18502 + }, + { + "epoch": 2.353771784760209, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70085906982422, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8629488945007324, + "num_tokens": 706036551.0, + "step": 18503 + }, + { + "epoch": 2.353898995038799, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72978973388672, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8902450799942017, + "num_tokens": 706075965.0, + "step": 18504 + }, + { + "epoch": 2.3540262053173895, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.805097579956055, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8608081936836243, + "num_tokens": 706117943.0, + "step": 18505 + }, + { + "epoch": 2.35415341559598, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69455909729004, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8766132593154907, + "num_tokens": 706157183.0, + "step": 18506 + }, + { + "epoch": 2.3542806258745705, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69770622253418, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8817839622497559, + "num_tokens": 706196878.0, + "step": 18507 + }, + { + "epoch": 2.354407836153161, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.778865814208984, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8599184155464172, + "num_tokens": 706231102.0, + "step": 18508 + }, + { + "epoch": 2.3545350464317516, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.629945755004883, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8802300691604614, + "num_tokens": 706273372.0, + "step": 18509 + }, + { + "epoch": 2.354662256710342, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.802858352661133, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8666657209396362, + "num_tokens": 706308106.0, + "step": 18510 + }, + { + "epoch": 2.3547894669889327, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.830768585205078, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8804248571395874, + "num_tokens": 706344267.0, + "step": 18511 + }, + { + "epoch": 2.354916677267523, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60551643371582, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.891265332698822, + "num_tokens": 706385019.0, + "step": 18512 + }, + { + "epoch": 2.3550438875461137, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.854087829589844, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8722331523895264, + "num_tokens": 706424913.0, + "step": 18513 + }, + { + "epoch": 2.3551710978247042, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66200065612793, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8732348084449768, + "num_tokens": 706466064.0, + "step": 18514 + }, + { + "epoch": 2.3552983081032948, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.735254287719727, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8798128962516785, + "num_tokens": 706504572.0, + "step": 18515 + }, + { + "epoch": 2.3554255183818853, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.566299438476562, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8670201897621155, + "num_tokens": 706543901.0, + "step": 18516 + }, + { + "epoch": 2.355552728660476, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.591564178466797, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8659928441047668, + "num_tokens": 706579095.0, + "step": 18517 + }, + { + "epoch": 2.3556799389390664, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.763872146606445, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8650956153869629, + "num_tokens": 706614078.0, + "step": 18518 + }, + { + "epoch": 2.355807149217657, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60426139831543, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8805853128433228, + "num_tokens": 706652596.0, + "step": 18519 + }, + { + "epoch": 2.3559343594962474, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.518177032470703, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8965843915939331, + "num_tokens": 706691297.0, + "step": 18520 + }, + { + "epoch": 2.356061569774838, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75602149963379, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8617928624153137, + "num_tokens": 706727344.0, + "step": 18521 + }, + { + "epoch": 2.3561887800534285, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87225341796875, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.876734733581543, + "num_tokens": 706770411.0, + "step": 18522 + }, + { + "epoch": 2.356315990332019, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.616741180419922, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8754967451095581, + "num_tokens": 706816215.0, + "step": 18523 + }, + { + "epoch": 2.3564432006106095, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71566390991211, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8680932521820068, + "num_tokens": 706857400.0, + "step": 18524 + }, + { + "epoch": 2.3565704108891996, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.725622177124023, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8648853898048401, + "num_tokens": 706892715.0, + "step": 18525 + }, + { + "epoch": 2.3566976211677906, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.802059173583984, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8737939596176147, + "num_tokens": 706934373.0, + "step": 18526 + }, + { + "epoch": 2.3568248314463807, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.715740203857422, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8740738034248352, + "num_tokens": 706976885.0, + "step": 18527 + }, + { + "epoch": 2.356952041724971, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.867862701416016, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8792302012443542, + "num_tokens": 707016957.0, + "step": 18528 + }, + { + "epoch": 2.3570792520035617, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64438247680664, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8828625082969666, + "num_tokens": 707053687.0, + "step": 18529 + }, + { + "epoch": 2.3572064622821522, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.712535858154297, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8672664165496826, + "num_tokens": 707087327.0, + "step": 18530 + }, + { + "epoch": 2.3573336725607428, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73296546936035, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8854110240936279, + "num_tokens": 707124479.0, + "step": 18531 + }, + { + "epoch": 2.3574608828393333, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75091552734375, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8784372210502625, + "num_tokens": 707162452.0, + "step": 18532 + }, + { + "epoch": 2.357588093117924, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.782514572143555, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8817130327224731, + "num_tokens": 707196765.0, + "step": 18533 + }, + { + "epoch": 2.3577153033965144, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.805999755859375, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8798476457595825, + "num_tokens": 707230151.0, + "step": 18534 + }, + { + "epoch": 2.357842513675105, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.62590217590332, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8554189205169678, + "num_tokens": 707264373.0, + "step": 18535 + }, + { + "epoch": 2.3579697239536954, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.714344024658203, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.867794394493103, + "num_tokens": 707304844.0, + "step": 18536 + }, + { + "epoch": 2.358096934232286, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60741424560547, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8752081394195557, + "num_tokens": 707347081.0, + "step": 18537 + }, + { + "epoch": 2.3582241445108765, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.749181747436523, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8665910959243774, + "num_tokens": 707388243.0, + "step": 18538 + }, + { + "epoch": 2.358351354789467, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.968486785888672, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8604594469070435, + "num_tokens": 707427109.0, + "step": 18539 + }, + { + "epoch": 2.3584785650680575, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.634592056274414, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8746883869171143, + "num_tokens": 707462502.0, + "step": 18540 + }, + { + "epoch": 2.358605775346648, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.686599731445312, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8680867552757263, + "num_tokens": 707502861.0, + "step": 18541 + }, + { + "epoch": 2.3587329856252386, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87815284729004, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8522120118141174, + "num_tokens": 707535868.0, + "step": 18542 + }, + { + "epoch": 2.358860195903829, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.684545516967773, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.876484215259552, + "num_tokens": 707566128.0, + "step": 18543 + }, + { + "epoch": 2.3589874061824196, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.74981689453125, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.879675030708313, + "num_tokens": 707605113.0, + "step": 18544 + }, + { + "epoch": 2.35911461646101, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81723976135254, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8713401556015015, + "num_tokens": 707647968.0, + "step": 18545 + }, + { + "epoch": 2.3592418267396007, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.714569091796875, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8800731897354126, + "num_tokens": 707682183.0, + "step": 18546 + }, + { + "epoch": 2.3593690370181912, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.574384689331055, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8819628953933716, + "num_tokens": 707720938.0, + "step": 18547 + }, + { + "epoch": 2.3594962472967818, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.709787368774414, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8614460229873657, + "num_tokens": 707762624.0, + "step": 18548 + }, + { + "epoch": 2.3596234575753723, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86629295349121, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8659923076629639, + "num_tokens": 707798406.0, + "step": 18549 + }, + { + "epoch": 2.3597506678539624, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.902803421020508, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8757134079933167, + "num_tokens": 707835935.0, + "step": 18550 + }, + { + "epoch": 2.3598778781325533, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66169548034668, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8846495151519775, + "num_tokens": 707872971.0, + "step": 18551 + }, + { + "epoch": 2.3600050884111434, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.691057205200195, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8843659162521362, + "num_tokens": 707909798.0, + "step": 18552 + }, + { + "epoch": 2.360132298689734, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76675796508789, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8761462569236755, + "num_tokens": 707951130.0, + "step": 18553 + }, + { + "epoch": 2.3602595089683245, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.769515991210938, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8612860441207886, + "num_tokens": 707989502.0, + "step": 18554 + }, + { + "epoch": 2.360386719246915, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.695100784301758, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8718694448471069, + "num_tokens": 708023521.0, + "step": 18555 + }, + { + "epoch": 2.3605139295255055, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79700469970703, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8791981339454651, + "num_tokens": 708057594.0, + "step": 18556 + }, + { + "epoch": 2.360641139804096, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.674053192138672, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8710930347442627, + "num_tokens": 708101393.0, + "step": 18557 + }, + { + "epoch": 2.3607683500826866, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.725902557373047, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8841961026191711, + "num_tokens": 708137141.0, + "step": 18558 + }, + { + "epoch": 2.360895560361277, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72478485107422, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8780711889266968, + "num_tokens": 708173497.0, + "step": 18559 + }, + { + "epoch": 2.3610227706398677, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7529239654541, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8731251955032349, + "num_tokens": 708209412.0, + "step": 18560 + }, + { + "epoch": 2.361149980918458, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.768762588500977, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8816914558410645, + "num_tokens": 708239675.0, + "step": 18561 + }, + { + "epoch": 2.3612771911970487, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.853347778320312, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8675867915153503, + "num_tokens": 708274918.0, + "step": 18562 + }, + { + "epoch": 2.3614044014756392, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71450424194336, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8580988049507141, + "num_tokens": 708313645.0, + "step": 18563 + }, + { + "epoch": 2.3615316117542298, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.760929107666016, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8744977712631226, + "num_tokens": 708357985.0, + "step": 18564 + }, + { + "epoch": 2.3616588220328203, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.879562377929688, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8531543016433716, + "num_tokens": 708393606.0, + "step": 18565 + }, + { + "epoch": 2.361786032311411, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84988784790039, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8784277439117432, + "num_tokens": 708428941.0, + "step": 18566 + }, + { + "epoch": 2.3619132425900013, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79227638244629, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8778695464134216, + "num_tokens": 708467847.0, + "step": 18567 + }, + { + "epoch": 2.362040452868592, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99062728881836, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8897127509117126, + "num_tokens": 708503212.0, + "step": 18568 + }, + { + "epoch": 2.3621676631471824, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.936595916748047, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8742948770523071, + "num_tokens": 708542897.0, + "step": 18569 + }, + { + "epoch": 2.362294873425773, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.766517639160156, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8616437911987305, + "num_tokens": 708574725.0, + "step": 18570 + }, + { + "epoch": 2.3624220837043635, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.778106689453125, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8832823038101196, + "num_tokens": 708613557.0, + "step": 18571 + }, + { + "epoch": 2.362549293982954, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93222999572754, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.865564227104187, + "num_tokens": 708651872.0, + "step": 18572 + }, + { + "epoch": 2.3626765042615445, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.818523406982422, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.861135721206665, + "num_tokens": 708688654.0, + "step": 18573 + }, + { + "epoch": 2.362803714540135, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67392349243164, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8802500367164612, + "num_tokens": 708723285.0, + "step": 18574 + }, + { + "epoch": 2.362930924818725, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97826385498047, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8825758099555969, + "num_tokens": 708760617.0, + "step": 18575 + }, + { + "epoch": 2.363058135097316, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.740713119506836, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8770055770874023, + "num_tokens": 708790792.0, + "step": 18576 + }, + { + "epoch": 2.363185345375906, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75071144104004, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8635372519493103, + "num_tokens": 708829935.0, + "step": 18577 + }, + { + "epoch": 2.3633125556544967, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.922494888305664, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8676959276199341, + "num_tokens": 708870578.0, + "step": 18578 + }, + { + "epoch": 2.3634397659330872, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71619987487793, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8711603879928589, + "num_tokens": 708907044.0, + "step": 18579 + }, + { + "epoch": 2.3635669762116778, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81202507019043, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8563560247421265, + "num_tokens": 708947443.0, + "step": 18580 + }, + { + "epoch": 2.3636941864902683, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.776151657104492, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8752930760383606, + "num_tokens": 708983174.0, + "step": 18581 + }, + { + "epoch": 2.363821396768859, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76348304748535, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8761007189750671, + "num_tokens": 709021950.0, + "step": 18582 + }, + { + "epoch": 2.3639486070474494, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.704212188720703, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8797659873962402, + "num_tokens": 709057236.0, + "step": 18583 + }, + { + "epoch": 2.36407581732604, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.060609817504883, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8742621541023254, + "num_tokens": 709093376.0, + "step": 18584 + }, + { + "epoch": 2.3642030276046304, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84593963623047, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8684931993484497, + "num_tokens": 709129502.0, + "step": 18585 + }, + { + "epoch": 2.364330237883221, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.841102600097656, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8731399774551392, + "num_tokens": 709162538.0, + "step": 18586 + }, + { + "epoch": 2.3644574481618115, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.928447723388672, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8627485036849976, + "num_tokens": 709201690.0, + "step": 18587 + }, + { + "epoch": 2.364584658440402, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.953479766845703, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8658817410469055, + "num_tokens": 709240593.0, + "step": 18588 + }, + { + "epoch": 2.3647118687189925, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89150619506836, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8841565847396851, + "num_tokens": 709278192.0, + "step": 18589 + }, + { + "epoch": 2.364839078997583, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.098079681396484, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8910326957702637, + "num_tokens": 709311797.0, + "step": 18590 + }, + { + "epoch": 2.3649662892761736, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.027067184448242, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8714520931243896, + "num_tokens": 709350379.0, + "step": 18591 + }, + { + "epoch": 2.365093499554764, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.950891494750977, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8986140489578247, + "num_tokens": 709389030.0, + "step": 18592 + }, + { + "epoch": 2.3652207098333546, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.916404724121094, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8657363653182983, + "num_tokens": 709433692.0, + "step": 18593 + }, + { + "epoch": 2.365347920111945, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.808509826660156, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8726410865783691, + "num_tokens": 709470773.0, + "step": 18594 + }, + { + "epoch": 2.3654751303905357, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04097557067871, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.869739830493927, + "num_tokens": 709506764.0, + "step": 18595 + }, + { + "epoch": 2.3656023406691262, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.947702407836914, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8722034096717834, + "num_tokens": 709546931.0, + "step": 18596 + }, + { + "epoch": 2.3657295509477168, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.864105224609375, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8747113943099976, + "num_tokens": 709582842.0, + "step": 18597 + }, + { + "epoch": 2.365856761226307, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.874650955200195, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8729948997497559, + "num_tokens": 709621574.0, + "step": 18598 + }, + { + "epoch": 2.365983971504898, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.907367706298828, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8908661007881165, + "num_tokens": 709662437.0, + "step": 18599 + }, + { + "epoch": 2.366111181783488, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.843202590942383, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8867658376693726, + "num_tokens": 709701318.0, + "step": 18600 + }, + { + "epoch": 2.3662383920620784, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.879560470581055, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8726928234100342, + "num_tokens": 709745156.0, + "step": 18601 + }, + { + "epoch": 2.366365602340669, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.813188552856445, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8681501150131226, + "num_tokens": 709785849.0, + "step": 18602 + }, + { + "epoch": 2.3664928126192595, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96227264404297, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8770018815994263, + "num_tokens": 709830038.0, + "step": 18603 + }, + { + "epoch": 2.36662002289785, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.790332794189453, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.870916485786438, + "num_tokens": 709871174.0, + "step": 18604 + }, + { + "epoch": 2.3667472331764405, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9815731048584, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8941805958747864, + "num_tokens": 709905921.0, + "step": 18605 + }, + { + "epoch": 2.366874443455031, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.930936813354492, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8895511031150818, + "num_tokens": 709939856.0, + "step": 18606 + }, + { + "epoch": 2.3670016537336216, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90248680114746, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8862699866294861, + "num_tokens": 709978805.0, + "step": 18607 + }, + { + "epoch": 2.367128864012212, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78146743774414, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.887121856212616, + "num_tokens": 710015015.0, + "step": 18608 + }, + { + "epoch": 2.3672560742908026, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.982009887695312, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.865059494972229, + "num_tokens": 710054887.0, + "step": 18609 + }, + { + "epoch": 2.367383284569393, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.586345672607422, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8733752965927124, + "num_tokens": 710097012.0, + "step": 18610 + }, + { + "epoch": 2.3675104948479837, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.924715042114258, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.857812225818634, + "num_tokens": 710130912.0, + "step": 18611 + }, + { + "epoch": 2.3676377051265742, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.826580047607422, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.869604229927063, + "num_tokens": 710168880.0, + "step": 18612 + }, + { + "epoch": 2.3677649154051648, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95313262939453, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8832628726959229, + "num_tokens": 710205619.0, + "step": 18613 + }, + { + "epoch": 2.3678921256837553, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81261444091797, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8793365955352783, + "num_tokens": 710239475.0, + "step": 18614 + }, + { + "epoch": 2.368019335962346, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.960359573364258, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8755184412002563, + "num_tokens": 710276533.0, + "step": 18615 + }, + { + "epoch": 2.3681465462409363, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.61054229736328, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8895031809806824, + "num_tokens": 710316731.0, + "step": 18616 + }, + { + "epoch": 2.368273756519527, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.048179626464844, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8698738813400269, + "num_tokens": 710352310.0, + "step": 18617 + }, + { + "epoch": 2.3684009667981174, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.673860549926758, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.870082437992096, + "num_tokens": 710400203.0, + "step": 18618 + }, + { + "epoch": 2.368528177076708, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.847124099731445, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8807187676429749, + "num_tokens": 710433054.0, + "step": 18619 + }, + { + "epoch": 2.3686553873552985, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.831857681274414, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8781746625900269, + "num_tokens": 710466879.0, + "step": 18620 + }, + { + "epoch": 2.368782597633889, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.769290924072266, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8672078847885132, + "num_tokens": 710502808.0, + "step": 18621 + }, + { + "epoch": 2.3689098079124795, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70400619506836, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8630599975585938, + "num_tokens": 710546190.0, + "step": 18622 + }, + { + "epoch": 2.3690370181910696, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7365665435791, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.875182032585144, + "num_tokens": 710585556.0, + "step": 18623 + }, + { + "epoch": 2.3691642284696606, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77645492553711, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8767275810241699, + "num_tokens": 710621366.0, + "step": 18624 + }, + { + "epoch": 2.3692914387482507, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76192283630371, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8783715963363647, + "num_tokens": 710660217.0, + "step": 18625 + }, + { + "epoch": 2.369418649026841, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.54673194885254, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8700598478317261, + "num_tokens": 710695798.0, + "step": 18626 + }, + { + "epoch": 2.3695458593054317, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.799333572387695, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8566157221794128, + "num_tokens": 710732739.0, + "step": 18627 + }, + { + "epoch": 2.3696730695840222, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.663196563720703, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8666617274284363, + "num_tokens": 710769651.0, + "step": 18628 + }, + { + "epoch": 2.3698002798626128, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78059959411621, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8712736368179321, + "num_tokens": 710809487.0, + "step": 18629 + }, + { + "epoch": 2.3699274901412033, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.799346923828125, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8764320611953735, + "num_tokens": 710849152.0, + "step": 18630 + }, + { + "epoch": 2.370054700419794, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.723600387573242, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8602913022041321, + "num_tokens": 710890933.0, + "step": 18631 + }, + { + "epoch": 2.3701819106983844, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.824951171875, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8776346445083618, + "num_tokens": 710928262.0, + "step": 18632 + }, + { + "epoch": 2.370309120976975, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.793758392333984, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8790094256401062, + "num_tokens": 710962969.0, + "step": 18633 + }, + { + "epoch": 2.3704363312555654, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.822860717773438, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.868861198425293, + "num_tokens": 710998717.0, + "step": 18634 + }, + { + "epoch": 2.370563541534156, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.625024795532227, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8590359687805176, + "num_tokens": 711032819.0, + "step": 18635 + }, + { + "epoch": 2.3706907518127465, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.761930465698242, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8634491562843323, + "num_tokens": 711069395.0, + "step": 18636 + }, + { + "epoch": 2.370817962091337, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8016357421875, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.875647783279419, + "num_tokens": 711105754.0, + "step": 18637 + }, + { + "epoch": 2.3709451723699275, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73749351501465, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8685581684112549, + "num_tokens": 711145291.0, + "step": 18638 + }, + { + "epoch": 2.371072382648518, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72464370727539, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8747621774673462, + "num_tokens": 711183431.0, + "step": 18639 + }, + { + "epoch": 2.3711995929271086, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.702720642089844, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8597686290740967, + "num_tokens": 711229754.0, + "step": 18640 + }, + { + "epoch": 2.371326803205699, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87732696533203, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8749433755874634, + "num_tokens": 711269071.0, + "step": 18641 + }, + { + "epoch": 2.3714540134842896, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.660808563232422, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8700536489486694, + "num_tokens": 711312036.0, + "step": 18642 + }, + { + "epoch": 2.37158122376288, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.851247787475586, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.881222665309906, + "num_tokens": 711347945.0, + "step": 18643 + }, + { + "epoch": 2.3717084340414707, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66558265686035, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8529448509216309, + "num_tokens": 711388433.0, + "step": 18644 + }, + { + "epoch": 2.371835644320061, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.693281173706055, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8773187398910522, + "num_tokens": 711425821.0, + "step": 18645 + }, + { + "epoch": 2.3719628545986517, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76629638671875, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.867425799369812, + "num_tokens": 711462725.0, + "step": 18646 + }, + { + "epoch": 2.3720900648772423, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.814332962036133, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8864785432815552, + "num_tokens": 711499987.0, + "step": 18647 + }, + { + "epoch": 2.3722172751558324, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.700603485107422, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8588600158691406, + "num_tokens": 711543658.0, + "step": 18648 + }, + { + "epoch": 2.3723444854344233, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.752132415771484, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8853499293327332, + "num_tokens": 711583875.0, + "step": 18649 + }, + { + "epoch": 2.3724716957130134, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.715362548828125, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8567323684692383, + "num_tokens": 711625543.0, + "step": 18650 + }, + { + "epoch": 2.372598905991604, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69832992553711, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8762418031692505, + "num_tokens": 711660528.0, + "step": 18651 + }, + { + "epoch": 2.3727261162701945, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7255859375, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8897648453712463, + "num_tokens": 711695646.0, + "step": 18652 + }, + { + "epoch": 2.372853326548785, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.74081039428711, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8583840131759644, + "num_tokens": 711734691.0, + "step": 18653 + }, + { + "epoch": 2.3729805368273755, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.809778213500977, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8752615451812744, + "num_tokens": 711771518.0, + "step": 18654 + }, + { + "epoch": 2.373107747105966, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.727764129638672, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8640087246894836, + "num_tokens": 711808968.0, + "step": 18655 + }, + { + "epoch": 2.3732349573845566, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.592727661132812, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8813889026641846, + "num_tokens": 711842944.0, + "step": 18656 + }, + { + "epoch": 2.373362167663147, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7825927734375, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8702715635299683, + "num_tokens": 711882740.0, + "step": 18657 + }, + { + "epoch": 2.3734893779417376, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72686004638672, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8661699295043945, + "num_tokens": 711925362.0, + "step": 18658 + }, + { + "epoch": 2.373616588220328, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.569503784179688, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8775470852851868, + "num_tokens": 711971285.0, + "step": 18659 + }, + { + "epoch": 2.3737437984989187, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76150131225586, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.868894100189209, + "num_tokens": 712013306.0, + "step": 18660 + }, + { + "epoch": 2.3738710087775092, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.587947845458984, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8745872974395752, + "num_tokens": 712050837.0, + "step": 18661 + }, + { + "epoch": 2.3739982190560998, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3786563873291, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8898168802261353, + "num_tokens": 712085140.0, + "step": 18662 + }, + { + "epoch": 2.3741254293346903, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.706151962280273, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8866109251976013, + "num_tokens": 712122495.0, + "step": 18663 + }, + { + "epoch": 2.374252639613281, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.812496185302734, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8678672909736633, + "num_tokens": 712161312.0, + "step": 18664 + }, + { + "epoch": 2.3743798498918713, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78220558166504, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8726334571838379, + "num_tokens": 712197275.0, + "step": 18665 + }, + { + "epoch": 2.374507060170462, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.643783569335938, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.861117959022522, + "num_tokens": 712231309.0, + "step": 18666 + }, + { + "epoch": 2.3746342704490524, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.761802673339844, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8689255714416504, + "num_tokens": 712264864.0, + "step": 18667 + }, + { + "epoch": 2.374761480727643, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.60364532470703, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8731825351715088, + "num_tokens": 712303564.0, + "step": 18668 + }, + { + "epoch": 2.3748886910062335, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.234819412231445, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8841150999069214, + "num_tokens": 712341872.0, + "step": 18669 + }, + { + "epoch": 2.375015901284824, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.890378952026367, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8615650534629822, + "num_tokens": 712379835.0, + "step": 18670 + }, + { + "epoch": 2.3751431115634145, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.899120330810547, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8694069385528564, + "num_tokens": 712417462.0, + "step": 18671 + }, + { + "epoch": 2.375270321842005, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.780492782592773, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8476837873458862, + "num_tokens": 712461581.0, + "step": 18672 + }, + { + "epoch": 2.375397532120595, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.785812377929688, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8726059198379517, + "num_tokens": 712499615.0, + "step": 18673 + }, + { + "epoch": 2.375524742399186, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.83025360107422, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8831319808959961, + "num_tokens": 712533264.0, + "step": 18674 + }, + { + "epoch": 2.375651952677776, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.924409866333008, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8649473190307617, + "num_tokens": 712571496.0, + "step": 18675 + }, + { + "epoch": 2.3757791629563667, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.807619094848633, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.880431056022644, + "num_tokens": 712606510.0, + "step": 18676 + }, + { + "epoch": 2.3759063732349572, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75426483154297, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.88227379322052, + "num_tokens": 712646218.0, + "step": 18677 + }, + { + "epoch": 2.3760335835135478, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.962390899658203, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8427655696868896, + "num_tokens": 712681905.0, + "step": 18678 + }, + { + "epoch": 2.3761607937921383, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.683856964111328, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8862211108207703, + "num_tokens": 712719916.0, + "step": 18679 + }, + { + "epoch": 2.376288004070729, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73372459411621, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8732059597969055, + "num_tokens": 712760849.0, + "step": 18680 + }, + { + "epoch": 2.3764152143493193, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95361328125, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8846310377120972, + "num_tokens": 712801199.0, + "step": 18681 + }, + { + "epoch": 2.37654242462791, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.749530792236328, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8680988550186157, + "num_tokens": 712843471.0, + "step": 18682 + }, + { + "epoch": 2.3766696349065004, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.6812801361084, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8744253516197205, + "num_tokens": 712880649.0, + "step": 18683 + }, + { + "epoch": 2.376796845185091, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77352523803711, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8508014678955078, + "num_tokens": 712920574.0, + "step": 18684 + }, + { + "epoch": 2.3769240554636815, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.699935913085938, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8817782402038574, + "num_tokens": 712962544.0, + "step": 18685 + }, + { + "epoch": 2.377051265742272, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.798341751098633, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8777661323547363, + "num_tokens": 712999138.0, + "step": 18686 + }, + { + "epoch": 2.3771784760208625, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.987958908081055, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8750096559524536, + "num_tokens": 713041799.0, + "step": 18687 + }, + { + "epoch": 2.377305686299453, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.554306030273438, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8658158183097839, + "num_tokens": 713088248.0, + "step": 18688 + }, + { + "epoch": 2.3774328965780436, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87482452392578, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8827986121177673, + "num_tokens": 713127239.0, + "step": 18689 + }, + { + "epoch": 2.377560106856634, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.604833602905273, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.876579761505127, + "num_tokens": 713168389.0, + "step": 18690 + }, + { + "epoch": 2.3776873171352246, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.023487091064453, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8660055994987488, + "num_tokens": 713208137.0, + "step": 18691 + }, + { + "epoch": 2.377814527413815, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.829723358154297, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8637679815292358, + "num_tokens": 713249344.0, + "step": 18692 + }, + { + "epoch": 2.3779417376924057, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.662975311279297, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8700599670410156, + "num_tokens": 713291376.0, + "step": 18693 + }, + { + "epoch": 2.378068947970996, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93623924255371, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8712760806083679, + "num_tokens": 713329925.0, + "step": 18694 + }, + { + "epoch": 2.3781961582495867, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.869903564453125, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8744029998779297, + "num_tokens": 713372509.0, + "step": 18695 + }, + { + "epoch": 2.378323368528177, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.901611328125, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8671133518218994, + "num_tokens": 713410055.0, + "step": 18696 + }, + { + "epoch": 2.378450578806768, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.865333557128906, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8718630075454712, + "num_tokens": 713453958.0, + "step": 18697 + }, + { + "epoch": 2.378577789085358, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.812408447265625, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8739726543426514, + "num_tokens": 713485683.0, + "step": 18698 + }, + { + "epoch": 2.3787049993639484, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.822307586669922, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8744192719459534, + "num_tokens": 713528565.0, + "step": 18699 + }, + { + "epoch": 2.378832209642539, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.038665771484375, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8699647188186646, + "num_tokens": 713560932.0, + "step": 18700 + }, + { + "epoch": 2.3789594199211295, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.791393280029297, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8698492050170898, + "num_tokens": 713592891.0, + "step": 18701 + }, + { + "epoch": 2.37908663019972, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.040252685546875, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8710966110229492, + "num_tokens": 713631285.0, + "step": 18702 + }, + { + "epoch": 2.3792138404783105, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72031021118164, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8551822304725647, + "num_tokens": 713675143.0, + "step": 18703 + }, + { + "epoch": 2.379341050756901, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.002338409423828, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8773832321166992, + "num_tokens": 713717556.0, + "step": 18704 + }, + { + "epoch": 2.3794682610354916, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.974130630493164, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8653460741043091, + "num_tokens": 713763515.0, + "step": 18705 + }, + { + "epoch": 2.379595471314082, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.882160186767578, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8777567744255066, + "num_tokens": 713801064.0, + "step": 18706 + }, + { + "epoch": 2.3797226815926726, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85227394104004, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8705196380615234, + "num_tokens": 713841662.0, + "step": 18707 + }, + { + "epoch": 2.379849891871263, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.745393753051758, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8754801154136658, + "num_tokens": 713883646.0, + "step": 18708 + }, + { + "epoch": 2.3799771021498537, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.016138076782227, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8934165835380554, + "num_tokens": 713925365.0, + "step": 18709 + }, + { + "epoch": 2.3801043124284442, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.613332748413086, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8693269491195679, + "num_tokens": 713961142.0, + "step": 18710 + }, + { + "epoch": 2.3802315227070348, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.287723541259766, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8809636235237122, + "num_tokens": 714001652.0, + "step": 18711 + }, + { + "epoch": 2.3803587329856253, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.844099044799805, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8747158050537109, + "num_tokens": 714043023.0, + "step": 18712 + }, + { + "epoch": 2.380485943264216, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03263282775879, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8736768364906311, + "num_tokens": 714080389.0, + "step": 18713 + }, + { + "epoch": 2.3806131535428063, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.001249313354492, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.876701831817627, + "num_tokens": 714117695.0, + "step": 18714 + }, + { + "epoch": 2.380740363821397, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.857982635498047, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8804792165756226, + "num_tokens": 714154204.0, + "step": 18715 + }, + { + "epoch": 2.3808675740999874, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.991226196289062, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8564154505729675, + "num_tokens": 714194857.0, + "step": 18716 + }, + { + "epoch": 2.380994784378578, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.980754852294922, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8836227655410767, + "num_tokens": 714228034.0, + "step": 18717 + }, + { + "epoch": 2.3811219946571685, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.836894989013672, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8788474798202515, + "num_tokens": 714269688.0, + "step": 18718 + }, + { + "epoch": 2.381249204935759, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.856441497802734, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8677953481674194, + "num_tokens": 714309039.0, + "step": 18719 + }, + { + "epoch": 2.3813764152143495, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89043617248535, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8789366483688354, + "num_tokens": 714347453.0, + "step": 18720 + }, + { + "epoch": 2.3815036254929396, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.56218910217285, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8713153600692749, + "num_tokens": 714385850.0, + "step": 18721 + }, + { + "epoch": 2.3816308357715306, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78964614868164, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8676673173904419, + "num_tokens": 714426735.0, + "step": 18722 + }, + { + "epoch": 2.3817580460501206, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.792373657226562, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8848394155502319, + "num_tokens": 714465279.0, + "step": 18723 + }, + { + "epoch": 2.381885256328711, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.915388107299805, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8626044392585754, + "num_tokens": 714505703.0, + "step": 18724 + }, + { + "epoch": 2.3820124666073017, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.805538177490234, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8673328757286072, + "num_tokens": 714543642.0, + "step": 18725 + }, + { + "epoch": 2.3821396768858922, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.922204971313477, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.889875054359436, + "num_tokens": 714583533.0, + "step": 18726 + }, + { + "epoch": 2.3822668871644828, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.64301872253418, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8777505159378052, + "num_tokens": 714625577.0, + "step": 18727 + }, + { + "epoch": 2.3823940974430733, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.816089630126953, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8697269558906555, + "num_tokens": 714664987.0, + "step": 18728 + }, + { + "epoch": 2.382521307721664, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.83332061767578, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8671140670776367, + "num_tokens": 714705881.0, + "step": 18729 + }, + { + "epoch": 2.3826485180002543, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.802749633789062, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8636429309844971, + "num_tokens": 714746012.0, + "step": 18730 + }, + { + "epoch": 2.382775728278845, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85331153869629, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8764216899871826, + "num_tokens": 714790001.0, + "step": 18731 + }, + { + "epoch": 2.3829029385574354, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.868083953857422, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8805739283561707, + "num_tokens": 714824515.0, + "step": 18732 + }, + { + "epoch": 2.383030148836026, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.783483505249023, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8661676645278931, + "num_tokens": 714860841.0, + "step": 18733 + }, + { + "epoch": 2.3831573591146165, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.918636322021484, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8753580451011658, + "num_tokens": 714896373.0, + "step": 18734 + }, + { + "epoch": 2.383284569393207, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.822853088378906, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8762446641921997, + "num_tokens": 714926684.0, + "step": 18735 + }, + { + "epoch": 2.3834117796717975, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.844987869262695, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8878983855247498, + "num_tokens": 714961057.0, + "step": 18736 + }, + { + "epoch": 2.383538989950388, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.893535614013672, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8788315653800964, + "num_tokens": 715001215.0, + "step": 18737 + }, + { + "epoch": 2.3836662002289786, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.558683395385742, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.854839026927948, + "num_tokens": 715038252.0, + "step": 18738 + }, + { + "epoch": 2.383793410507569, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.793289184570312, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8722707033157349, + "num_tokens": 715078320.0, + "step": 18739 + }, + { + "epoch": 2.3839206207861596, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.808544158935547, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.868034839630127, + "num_tokens": 715115628.0, + "step": 18740 + }, + { + "epoch": 2.38404783106475, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89586067199707, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8830485343933105, + "num_tokens": 715153294.0, + "step": 18741 + }, + { + "epoch": 2.3841750413433407, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79863929748535, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8739360570907593, + "num_tokens": 715193736.0, + "step": 18742 + }, + { + "epoch": 2.384302251621931, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79244613647461, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8737668395042419, + "num_tokens": 715234539.0, + "step": 18743 + }, + { + "epoch": 2.3844294619005217, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.988605499267578, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8855993151664734, + "num_tokens": 715271567.0, + "step": 18744 + }, + { + "epoch": 2.3845566721791123, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.904708862304688, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8844786286354065, + "num_tokens": 715314850.0, + "step": 18745 + }, + { + "epoch": 2.3846838824577024, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.920211791992188, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8794558048248291, + "num_tokens": 715353353.0, + "step": 18746 + }, + { + "epoch": 2.3848110927362933, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.019100189208984, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8840230703353882, + "num_tokens": 715387843.0, + "step": 18747 + }, + { + "epoch": 2.3849383030148834, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.961721420288086, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8609048128128052, + "num_tokens": 715428037.0, + "step": 18748 + }, + { + "epoch": 2.385065513293474, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9666748046875, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.886979877948761, + "num_tokens": 715464403.0, + "step": 18749 + }, + { + "epoch": 2.3851927235720645, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.927141189575195, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8823287487030029, + "num_tokens": 715501820.0, + "step": 18750 + }, + { + "epoch": 2.385319933850655, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86933708190918, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8833861351013184, + "num_tokens": 715543287.0, + "step": 18751 + }, + { + "epoch": 2.3854471441292455, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.860994338989258, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8822543621063232, + "num_tokens": 715578189.0, + "step": 18752 + }, + { + "epoch": 2.385574354407836, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73131561279297, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.864075779914856, + "num_tokens": 715616447.0, + "step": 18753 + }, + { + "epoch": 2.3857015646864266, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.885894775390625, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8778299689292908, + "num_tokens": 715651692.0, + "step": 18754 + }, + { + "epoch": 2.385828774965017, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72635841369629, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8662797212600708, + "num_tokens": 715687733.0, + "step": 18755 + }, + { + "epoch": 2.3859559852436076, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.739582061767578, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8518396615982056, + "num_tokens": 715727481.0, + "step": 18756 + }, + { + "epoch": 2.386083195522198, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84833526611328, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.856058657169342, + "num_tokens": 715767549.0, + "step": 18757 + }, + { + "epoch": 2.3862104058007887, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.65203285217285, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8667386770248413, + "num_tokens": 715807251.0, + "step": 18758 + }, + { + "epoch": 2.386337616079379, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.730363845825195, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8574988842010498, + "num_tokens": 715846893.0, + "step": 18759 + }, + { + "epoch": 2.3864648263579697, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86402130126953, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8599568605422974, + "num_tokens": 715883611.0, + "step": 18760 + }, + { + "epoch": 2.3865920366365603, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86221694946289, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.874113917350769, + "num_tokens": 715915598.0, + "step": 18761 + }, + { + "epoch": 2.386719246915151, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.813411712646484, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8802192807197571, + "num_tokens": 715956647.0, + "step": 18762 + }, + { + "epoch": 2.3868464571937413, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84490203857422, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8806601762771606, + "num_tokens": 715996627.0, + "step": 18763 + }, + { + "epoch": 2.386973667472332, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.753122329711914, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8937156200408936, + "num_tokens": 716027823.0, + "step": 18764 + }, + { + "epoch": 2.3871008777509224, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71147346496582, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8593952059745789, + "num_tokens": 716067430.0, + "step": 18765 + }, + { + "epoch": 2.387228088029513, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.713117599487305, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8660745620727539, + "num_tokens": 716109436.0, + "step": 18766 + }, + { + "epoch": 2.3873552983081034, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.773954391479492, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8687975406646729, + "num_tokens": 716148157.0, + "step": 18767 + }, + { + "epoch": 2.387482508586694, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.578378677368164, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8752126097679138, + "num_tokens": 716184662.0, + "step": 18768 + }, + { + "epoch": 2.3876097188652845, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.77735137939453, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8667125701904297, + "num_tokens": 716227382.0, + "step": 18769 + }, + { + "epoch": 2.387736929143875, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.65839385986328, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8764517307281494, + "num_tokens": 716262236.0, + "step": 18770 + }, + { + "epoch": 2.387864139422465, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.760883331298828, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8807599544525146, + "num_tokens": 716295541.0, + "step": 18771 + }, + { + "epoch": 2.387991349701056, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.916061401367188, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8759143352508545, + "num_tokens": 716331796.0, + "step": 18772 + }, + { + "epoch": 2.388118559979646, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.750686645507812, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.864059567451477, + "num_tokens": 716369539.0, + "step": 18773 + }, + { + "epoch": 2.3882457702582367, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.734657287597656, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8903515338897705, + "num_tokens": 716409806.0, + "step": 18774 + }, + { + "epoch": 2.3883729805368272, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.928377151489258, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8660054206848145, + "num_tokens": 716448191.0, + "step": 18775 + }, + { + "epoch": 2.3885001908154178, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.824148178100586, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8753914833068848, + "num_tokens": 716486522.0, + "step": 18776 + }, + { + "epoch": 2.3886274010940083, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.702312469482422, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.874654233455658, + "num_tokens": 716519383.0, + "step": 18777 + }, + { + "epoch": 2.388754611372599, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.777400970458984, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8751709461212158, + "num_tokens": 716556136.0, + "step": 18778 + }, + { + "epoch": 2.3888818216511893, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90645980834961, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8659654855728149, + "num_tokens": 716592346.0, + "step": 18779 + }, + { + "epoch": 2.38900903192978, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.578168869018555, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8666559457778931, + "num_tokens": 716633996.0, + "step": 18780 + }, + { + "epoch": 2.3891362422083704, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.810068130493164, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8692216873168945, + "num_tokens": 716669638.0, + "step": 18781 + }, + { + "epoch": 2.389263452486961, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75339698791504, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8649516105651855, + "num_tokens": 716711194.0, + "step": 18782 + }, + { + "epoch": 2.3893906627655515, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.886138916015625, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8803160786628723, + "num_tokens": 716753402.0, + "step": 18783 + }, + { + "epoch": 2.389517873044142, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86566162109375, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8750708103179932, + "num_tokens": 716794638.0, + "step": 18784 + }, + { + "epoch": 2.3896450833227325, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.048023223876953, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8454900979995728, + "num_tokens": 716834003.0, + "step": 18785 + }, + { + "epoch": 2.389772293601323, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.836095809936523, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8791676163673401, + "num_tokens": 716867349.0, + "step": 18786 + }, + { + "epoch": 2.3898995038799136, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.938770294189453, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8717540502548218, + "num_tokens": 716904271.0, + "step": 18787 + }, + { + "epoch": 2.390026714158504, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.658754348754883, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8758924007415771, + "num_tokens": 716936165.0, + "step": 18788 + }, + { + "epoch": 2.3901539244370946, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.692033767700195, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.871942400932312, + "num_tokens": 716974011.0, + "step": 18789 + }, + { + "epoch": 2.390281134715685, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88224983215332, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8736107349395752, + "num_tokens": 717012438.0, + "step": 18790 + }, + { + "epoch": 2.3904083449942757, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.910194396972656, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8636432886123657, + "num_tokens": 717051755.0, + "step": 18791 + }, + { + "epoch": 2.390535555272866, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.927465438842773, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8734801411628723, + "num_tokens": 717090740.0, + "step": 18792 + }, + { + "epoch": 2.3906627655514567, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87675666809082, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8484774827957153, + "num_tokens": 717132033.0, + "step": 18793 + }, + { + "epoch": 2.390789975830047, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.034061431884766, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8795825242996216, + "num_tokens": 717168255.0, + "step": 18794 + }, + { + "epoch": 2.390917186108638, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.708213806152344, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8708810806274414, + "num_tokens": 717206863.0, + "step": 18795 + }, + { + "epoch": 2.391044396387228, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.982667922973633, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8524303436279297, + "num_tokens": 717245374.0, + "step": 18796 + }, + { + "epoch": 2.3911716066658184, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.862096786499023, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8915241360664368, + "num_tokens": 717279821.0, + "step": 18797 + }, + { + "epoch": 2.391298816944409, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70598602294922, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8655529618263245, + "num_tokens": 717318896.0, + "step": 18798 + }, + { + "epoch": 2.3914260272229995, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9609375, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8631435632705688, + "num_tokens": 717361493.0, + "step": 18799 + }, + { + "epoch": 2.39155323750159, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.878259658813477, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8642269372940063, + "num_tokens": 717402801.0, + "step": 18800 + }, + { + "epoch": 2.3916804477801805, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.932239532470703, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8906318545341492, + "num_tokens": 717441275.0, + "step": 18801 + }, + { + "epoch": 2.391807658058771, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.69000816345215, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8609393835067749, + "num_tokens": 717477423.0, + "step": 18802 + }, + { + "epoch": 2.3919348683373616, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.864484786987305, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8656104803085327, + "num_tokens": 717512978.0, + "step": 18803 + }, + { + "epoch": 2.392062078615952, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92789649963379, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8914059400558472, + "num_tokens": 717549710.0, + "step": 18804 + }, + { + "epoch": 2.3921892888945426, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.032447814941406, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8689886331558228, + "num_tokens": 717586588.0, + "step": 18805 + }, + { + "epoch": 2.392316499173133, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.869857788085938, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.885223925113678, + "num_tokens": 717621507.0, + "step": 18806 + }, + { + "epoch": 2.3924437094517237, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.74696922302246, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8748397827148438, + "num_tokens": 717655915.0, + "step": 18807 + }, + { + "epoch": 2.392570919730314, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.739980697631836, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.870226263999939, + "num_tokens": 717696357.0, + "step": 18808 + }, + { + "epoch": 2.3926981300089047, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.791452407836914, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8870397806167603, + "num_tokens": 717735397.0, + "step": 18809 + }, + { + "epoch": 2.3928253402874953, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.880382537841797, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8773051500320435, + "num_tokens": 717770081.0, + "step": 18810 + }, + { + "epoch": 2.392952550566086, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66604995727539, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8888868689537048, + "num_tokens": 717805262.0, + "step": 18811 + }, + { + "epoch": 2.3930797608446763, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.717010498046875, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8726797103881836, + "num_tokens": 717841877.0, + "step": 18812 + }, + { + "epoch": 2.393206971123267, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.66694450378418, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.857369065284729, + "num_tokens": 717881435.0, + "step": 18813 + }, + { + "epoch": 2.3933341814018574, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.808706283569336, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8920391798019409, + "num_tokens": 717922240.0, + "step": 18814 + }, + { + "epoch": 2.393461391680448, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87386131286621, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8704327344894409, + "num_tokens": 717958438.0, + "step": 18815 + }, + { + "epoch": 2.3935886019590384, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.738746643066406, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8647496104240417, + "num_tokens": 717994451.0, + "step": 18816 + }, + { + "epoch": 2.393715812237629, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.864748001098633, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8740029335021973, + "num_tokens": 718034318.0, + "step": 18817 + }, + { + "epoch": 2.3938430225162195, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.807647705078125, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8717492818832397, + "num_tokens": 718074994.0, + "step": 18818 + }, + { + "epoch": 2.3939702327948096, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.65865135192871, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8702964782714844, + "num_tokens": 718118849.0, + "step": 18819 + }, + { + "epoch": 2.3940974430734006, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.937137603759766, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.879380464553833, + "num_tokens": 718152874.0, + "step": 18820 + }, + { + "epoch": 2.3942246533519906, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75479507446289, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8656494617462158, + "num_tokens": 718194206.0, + "step": 18821 + }, + { + "epoch": 2.394351863630581, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94647979736328, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8759621381759644, + "num_tokens": 718227408.0, + "step": 18822 + }, + { + "epoch": 2.3944790739091717, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.67723274230957, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8671895265579224, + "num_tokens": 718266962.0, + "step": 18823 + }, + { + "epoch": 2.3946062841877622, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.82530403137207, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8777600526809692, + "num_tokens": 718307873.0, + "step": 18824 + }, + { + "epoch": 2.3947334944663528, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.964773178100586, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.879342794418335, + "num_tokens": 718350296.0, + "step": 18825 + }, + { + "epoch": 2.3948607047449433, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.83478355407715, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8757147789001465, + "num_tokens": 718390734.0, + "step": 18826 + }, + { + "epoch": 2.394987915023534, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84006118774414, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8888220191001892, + "num_tokens": 718435630.0, + "step": 18827 + }, + { + "epoch": 2.3951151253021243, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92153549194336, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8798462152481079, + "num_tokens": 718474791.0, + "step": 18828 + }, + { + "epoch": 2.395242335580715, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75320053100586, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8558990359306335, + "num_tokens": 718514957.0, + "step": 18829 + }, + { + "epoch": 2.3953695458593054, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.00308609008789, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8845677375793457, + "num_tokens": 718551761.0, + "step": 18830 + }, + { + "epoch": 2.395496756137896, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.72409439086914, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8720192909240723, + "num_tokens": 718586445.0, + "step": 18831 + }, + { + "epoch": 2.3956239664164865, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.968963623046875, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.868188202381134, + "num_tokens": 718631105.0, + "step": 18832 + }, + { + "epoch": 2.395751176695077, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.675857543945312, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8700892329216003, + "num_tokens": 718672278.0, + "step": 18833 + }, + { + "epoch": 2.3958783869736675, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99155044555664, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8785319924354553, + "num_tokens": 718713191.0, + "step": 18834 + }, + { + "epoch": 2.396005597252258, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.818984985351562, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8676788210868835, + "num_tokens": 718745012.0, + "step": 18835 + }, + { + "epoch": 2.3961328075308486, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8339786529541, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8736201524734497, + "num_tokens": 718780149.0, + "step": 18836 + }, + { + "epoch": 2.396260017809439, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.00244903564453, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8693184852600098, + "num_tokens": 718812831.0, + "step": 18837 + }, + { + "epoch": 2.3963872280880296, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.879037857055664, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8833708167076111, + "num_tokens": 718847650.0, + "step": 18838 + }, + { + "epoch": 2.39651443836662, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.62412452697754, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8736178278923035, + "num_tokens": 718883417.0, + "step": 18839 + }, + { + "epoch": 2.3966416486452107, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.884510040283203, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8981777429580688, + "num_tokens": 718915501.0, + "step": 18840 + }, + { + "epoch": 2.396768858923801, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81683921813965, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.887161135673523, + "num_tokens": 718954485.0, + "step": 18841 + }, + { + "epoch": 2.3968960692023917, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.891862869262695, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8584843277931213, + "num_tokens": 718995815.0, + "step": 18842 + }, + { + "epoch": 2.3970232794809823, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.784517288208008, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8880403637886047, + "num_tokens": 719032820.0, + "step": 18843 + }, + { + "epoch": 2.3971504897595723, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.769079208374023, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8717606067657471, + "num_tokens": 719066966.0, + "step": 18844 + }, + { + "epoch": 2.3972777000381633, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.833627700805664, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8725621700286865, + "num_tokens": 719108768.0, + "step": 18845 + }, + { + "epoch": 2.3974049103167534, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.73846435546875, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8707817196846008, + "num_tokens": 719151451.0, + "step": 18846 + }, + { + "epoch": 2.397532120595344, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.939966201782227, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8664774298667908, + "num_tokens": 719192103.0, + "step": 18847 + }, + { + "epoch": 2.3976593308739345, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79262351989746, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.869624137878418, + "num_tokens": 719226514.0, + "step": 18848 + }, + { + "epoch": 2.397786541152525, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.840471267700195, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8961424827575684, + "num_tokens": 719262944.0, + "step": 18849 + }, + { + "epoch": 2.3979137514311155, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.054365158081055, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.879695475101471, + "num_tokens": 719306707.0, + "step": 18850 + }, + { + "epoch": 2.398040961709706, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.809316635131836, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8789674043655396, + "num_tokens": 719347523.0, + "step": 18851 + }, + { + "epoch": 2.3981681719882966, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.842262268066406, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8922215700149536, + "num_tokens": 719379474.0, + "step": 18852 + }, + { + "epoch": 2.398295382266887, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93283462524414, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8590465784072876, + "num_tokens": 719421624.0, + "step": 18853 + }, + { + "epoch": 2.3984225925454776, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.795507431030273, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8696862459182739, + "num_tokens": 719455570.0, + "step": 18854 + }, + { + "epoch": 2.398549802824068, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.659910202026367, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8896773457527161, + "num_tokens": 719500300.0, + "step": 18855 + }, + { + "epoch": 2.3986770131026587, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93482780456543, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8587598204612732, + "num_tokens": 719538119.0, + "step": 18856 + }, + { + "epoch": 2.398804223381249, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.797727584838867, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8636094331741333, + "num_tokens": 719574012.0, + "step": 18857 + }, + { + "epoch": 2.3989314336598397, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.917163848876953, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8861083984375, + "num_tokens": 719609548.0, + "step": 18858 + }, + { + "epoch": 2.3990586439384303, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98775863647461, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8674892783164978, + "num_tokens": 719647280.0, + "step": 18859 + }, + { + "epoch": 2.399185854217021, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.82349395751953, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8885288238525391, + "num_tokens": 719686987.0, + "step": 18860 + }, + { + "epoch": 2.3993130644956113, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.878442764282227, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8921151161193848, + "num_tokens": 719727974.0, + "step": 18861 + }, + { + "epoch": 2.399440274774202, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.904163360595703, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.873871922492981, + "num_tokens": 719764308.0, + "step": 18862 + }, + { + "epoch": 2.3995674850527924, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.883588790893555, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8629463911056519, + "num_tokens": 719801872.0, + "step": 18863 + }, + { + "epoch": 2.399694695331383, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.703449249267578, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8758782744407654, + "num_tokens": 719842504.0, + "step": 18864 + }, + { + "epoch": 2.3998219056099734, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89036750793457, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8683911561965942, + "num_tokens": 719878885.0, + "step": 18865 + }, + { + "epoch": 2.399949115888564, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71578025817871, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8572995662689209, + "num_tokens": 719922891.0, + "step": 18866 + }, + { + "epoch": 2.4000763261671545, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.934619903564453, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8709762692451477, + "num_tokens": 719960113.0, + "step": 18867 + }, + { + "epoch": 2.400203536445745, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.828279495239258, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8724589347839355, + "num_tokens": 720006079.0, + "step": 18868 + }, + { + "epoch": 2.400330746724335, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.933900833129883, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8606635332107544, + "num_tokens": 720042219.0, + "step": 18869 + }, + { + "epoch": 2.400457957002926, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.846738815307617, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8751324415206909, + "num_tokens": 720082574.0, + "step": 18870 + }, + { + "epoch": 2.400585167281516, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.970352172851562, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8727095127105713, + "num_tokens": 720115583.0, + "step": 18871 + }, + { + "epoch": 2.4007123775601067, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85548973083496, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8643069863319397, + "num_tokens": 720151350.0, + "step": 18872 + }, + { + "epoch": 2.400839587838697, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.764245986938477, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8658806085586548, + "num_tokens": 720187072.0, + "step": 18873 + }, + { + "epoch": 2.4009667981172877, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.858671188354492, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8770344257354736, + "num_tokens": 720222249.0, + "step": 18874 + }, + { + "epoch": 2.4010940083958783, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91547203063965, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8718063235282898, + "num_tokens": 720264682.0, + "step": 18875 + }, + { + "epoch": 2.401221218674469, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.835115432739258, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.870076596736908, + "num_tokens": 720309935.0, + "step": 18876 + }, + { + "epoch": 2.4013484289530593, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.916728973388672, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8684717416763306, + "num_tokens": 720343992.0, + "step": 18877 + }, + { + "epoch": 2.40147563923165, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.608455657958984, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.872578501701355, + "num_tokens": 720379707.0, + "step": 18878 + }, + { + "epoch": 2.4016028495102404, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81736183166504, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8703061938285828, + "num_tokens": 720419796.0, + "step": 18879 + }, + { + "epoch": 2.401730059788831, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84465980529785, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8752169013023376, + "num_tokens": 720460919.0, + "step": 18880 + }, + { + "epoch": 2.4018572700674214, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.881868362426758, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8696637153625488, + "num_tokens": 720498403.0, + "step": 18881 + }, + { + "epoch": 2.401984480346012, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8861083984375, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8733288049697876, + "num_tokens": 720534931.0, + "step": 18882 + }, + { + "epoch": 2.4021116906246025, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.999370574951172, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8767635822296143, + "num_tokens": 720574062.0, + "step": 18883 + }, + { + "epoch": 2.402238900903193, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.912504196166992, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8645538091659546, + "num_tokens": 720611552.0, + "step": 18884 + }, + { + "epoch": 2.4023661111817836, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.833986282348633, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.872933030128479, + "num_tokens": 720647886.0, + "step": 18885 + }, + { + "epoch": 2.402493321460374, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.003768920898438, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8780550956726074, + "num_tokens": 720688034.0, + "step": 18886 + }, + { + "epoch": 2.4026205317389646, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.764678955078125, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8802580237388611, + "num_tokens": 720727805.0, + "step": 18887 + }, + { + "epoch": 2.402747742017555, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.924602508544922, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8752335906028748, + "num_tokens": 720759717.0, + "step": 18888 + }, + { + "epoch": 2.4028749522961457, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.873443603515625, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8677821755409241, + "num_tokens": 720790435.0, + "step": 18889 + }, + { + "epoch": 2.403002162574736, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97793197631836, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.869147777557373, + "num_tokens": 720831715.0, + "step": 18890 + }, + { + "epoch": 2.4031293728533267, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75159454345703, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.871758759021759, + "num_tokens": 720869598.0, + "step": 18891 + }, + { + "epoch": 2.403256583131917, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.816282272338867, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8689216375350952, + "num_tokens": 720911122.0, + "step": 18892 + }, + { + "epoch": 2.403383793410508, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.028091430664062, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8775309324264526, + "num_tokens": 720952265.0, + "step": 18893 + }, + { + "epoch": 2.403511003689098, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.718196868896484, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8772769570350647, + "num_tokens": 720991191.0, + "step": 18894 + }, + { + "epoch": 2.4036382139676884, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.997709274291992, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8831125497817993, + "num_tokens": 721032524.0, + "step": 18895 + }, + { + "epoch": 2.403765424246279, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.616352081298828, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8729382753372192, + "num_tokens": 721072699.0, + "step": 18896 + }, + { + "epoch": 2.4038926345248695, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90287971496582, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8852413892745972, + "num_tokens": 721109467.0, + "step": 18897 + }, + { + "epoch": 2.40401984480346, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94647789001465, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8493633270263672, + "num_tokens": 721142108.0, + "step": 18898 + }, + { + "epoch": 2.4041470550820505, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.651872634887695, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8658764362335205, + "num_tokens": 721179875.0, + "step": 18899 + }, + { + "epoch": 2.404274265360641, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0444278717041, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8701156973838806, + "num_tokens": 721217141.0, + "step": 18900 + }, + { + "epoch": 2.4044014756392316, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.983854293823242, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.867493748664856, + "num_tokens": 721256357.0, + "step": 18901 + }, + { + "epoch": 2.404528685917822, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.708833694458008, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8731365203857422, + "num_tokens": 721292610.0, + "step": 18902 + }, + { + "epoch": 2.4046558961964126, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89803695678711, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8785233497619629, + "num_tokens": 721329048.0, + "step": 18903 + }, + { + "epoch": 2.404783106475003, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7890567779541, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8876707553863525, + "num_tokens": 721364249.0, + "step": 18904 + }, + { + "epoch": 2.4049103167535937, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.785688400268555, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8803873062133789, + "num_tokens": 721391154.0, + "step": 18905 + }, + { + "epoch": 2.405037527032184, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.82225227355957, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8831921815872192, + "num_tokens": 721428443.0, + "step": 18906 + }, + { + "epoch": 2.4051647373107747, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.607439041137695, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8632292151451111, + "num_tokens": 721474545.0, + "step": 18907 + }, + { + "epoch": 2.4052919475893653, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91448211669922, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8740304708480835, + "num_tokens": 721513153.0, + "step": 18908 + }, + { + "epoch": 2.405419157867956, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.618785858154297, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8691027760505676, + "num_tokens": 721556287.0, + "step": 18909 + }, + { + "epoch": 2.4055463681465463, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.978960037231445, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8790256977081299, + "num_tokens": 721590219.0, + "step": 18910 + }, + { + "epoch": 2.405673578425137, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81426239013672, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8731880187988281, + "num_tokens": 721627308.0, + "step": 18911 + }, + { + "epoch": 2.4058007887037274, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.551952362060547, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8684291243553162, + "num_tokens": 721664218.0, + "step": 18912 + }, + { + "epoch": 2.405927998982318, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.70438575744629, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8836542367935181, + "num_tokens": 721703511.0, + "step": 18913 + }, + { + "epoch": 2.4060552092609084, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.738672256469727, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8853351473808289, + "num_tokens": 721741512.0, + "step": 18914 + }, + { + "epoch": 2.406182419539499, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.600200653076172, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8773493766784668, + "num_tokens": 721778443.0, + "step": 18915 + }, + { + "epoch": 2.4063096298180895, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76702308654785, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8703069686889648, + "num_tokens": 721817226.0, + "step": 18916 + }, + { + "epoch": 2.4064368400966796, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.715070724487305, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8596439957618713, + "num_tokens": 721859419.0, + "step": 18917 + }, + { + "epoch": 2.4065640503752705, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86385726928711, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8862258791923523, + "num_tokens": 721895183.0, + "step": 18918 + }, + { + "epoch": 2.4066912606538606, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.628122329711914, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8593114614486694, + "num_tokens": 721933566.0, + "step": 18919 + }, + { + "epoch": 2.406818470932451, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.652067184448242, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8765985369682312, + "num_tokens": 721968827.0, + "step": 18920 + }, + { + "epoch": 2.4069456812110417, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05694007873535, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8752789497375488, + "num_tokens": 722004989.0, + "step": 18921 + }, + { + "epoch": 2.407072891489632, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.635940551757812, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8758093118667603, + "num_tokens": 722043681.0, + "step": 18922 + }, + { + "epoch": 2.4072001017682227, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.884584426879883, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8721921443939209, + "num_tokens": 722086541.0, + "step": 18923 + }, + { + "epoch": 2.4073273120468133, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.752267837524414, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8736920952796936, + "num_tokens": 722125519.0, + "step": 18924 + }, + { + "epoch": 2.407454522325404, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.80237579345703, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8691968321800232, + "num_tokens": 722157665.0, + "step": 18925 + }, + { + "epoch": 2.4075817326039943, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.824337005615234, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8860023021697998, + "num_tokens": 722191819.0, + "step": 18926 + }, + { + "epoch": 2.407708942882585, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.807592391967773, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8627793192863464, + "num_tokens": 722229755.0, + "step": 18927 + }, + { + "epoch": 2.4078361531611754, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.845279693603516, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8635813593864441, + "num_tokens": 722267440.0, + "step": 18928 + }, + { + "epoch": 2.407963363439766, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78192138671875, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8689703941345215, + "num_tokens": 722310102.0, + "step": 18929 + }, + { + "epoch": 2.4080905737183564, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.807727813720703, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8669630289077759, + "num_tokens": 722346713.0, + "step": 18930 + }, + { + "epoch": 2.408217783996947, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.770503997802734, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8676967620849609, + "num_tokens": 722387803.0, + "step": 18931 + }, + { + "epoch": 2.4083449942755375, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.944887161254883, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8637505769729614, + "num_tokens": 722428943.0, + "step": 18932 + }, + { + "epoch": 2.408472204554128, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96634292602539, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8660404682159424, + "num_tokens": 722467860.0, + "step": 18933 + }, + { + "epoch": 2.4085994148327186, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.76591682434082, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8695124387741089, + "num_tokens": 722503067.0, + "step": 18934 + }, + { + "epoch": 2.408726625111309, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.829669952392578, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8758552670478821, + "num_tokens": 722537842.0, + "step": 18935 + }, + { + "epoch": 2.4088538353898996, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.846708297729492, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8789286017417908, + "num_tokens": 722574036.0, + "step": 18936 + }, + { + "epoch": 2.40898104566849, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.961956024169922, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8741053342819214, + "num_tokens": 722606087.0, + "step": 18937 + }, + { + "epoch": 2.4091082559470807, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.823776245117188, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8715428113937378, + "num_tokens": 722648745.0, + "step": 18938 + }, + { + "epoch": 2.409235466225671, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.727561950683594, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8643050789833069, + "num_tokens": 722688589.0, + "step": 18939 + }, + { + "epoch": 2.4093626765042617, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.82486915588379, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8898625373840332, + "num_tokens": 722723941.0, + "step": 18940 + }, + { + "epoch": 2.4094898867828523, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.891366958618164, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8557801246643066, + "num_tokens": 722768242.0, + "step": 18941 + }, + { + "epoch": 2.4096170970614423, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.788421630859375, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8733395338058472, + "num_tokens": 722800970.0, + "step": 18942 + }, + { + "epoch": 2.4097443073400333, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92675018310547, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8641093969345093, + "num_tokens": 722839168.0, + "step": 18943 + }, + { + "epoch": 2.4098715176186234, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.942895889282227, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8845778703689575, + "num_tokens": 722883451.0, + "step": 18944 + }, + { + "epoch": 2.409998727897214, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84671974182129, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8734447360038757, + "num_tokens": 722928581.0, + "step": 18945 + }, + { + "epoch": 2.4101259381758044, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85214614868164, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8835962414741516, + "num_tokens": 722967141.0, + "step": 18946 + }, + { + "epoch": 2.410253148454395, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.862836837768555, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8693986535072327, + "num_tokens": 723006263.0, + "step": 18947 + }, + { + "epoch": 2.4103803587329855, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.769895553588867, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8758376836776733, + "num_tokens": 723050843.0, + "step": 18948 + }, + { + "epoch": 2.410507569011576, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93022918701172, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8731644153594971, + "num_tokens": 723092124.0, + "step": 18949 + }, + { + "epoch": 2.4106347792901666, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78333282470703, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8841289281845093, + "num_tokens": 723122478.0, + "step": 18950 + }, + { + "epoch": 2.410761989568757, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.157955169677734, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8668333888053894, + "num_tokens": 723163054.0, + "step": 18951 + }, + { + "epoch": 2.4108891998473476, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.83295440673828, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8730460405349731, + "num_tokens": 723201518.0, + "step": 18952 + }, + { + "epoch": 2.411016410125938, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.960506439208984, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8732886910438538, + "num_tokens": 723235685.0, + "step": 18953 + }, + { + "epoch": 2.4111436204045287, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.989145278930664, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8739326000213623, + "num_tokens": 723266510.0, + "step": 18954 + }, + { + "epoch": 2.411270830683119, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.793750762939453, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8760290145874023, + "num_tokens": 723302800.0, + "step": 18955 + }, + { + "epoch": 2.4113980409617097, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.846220016479492, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.875927746295929, + "num_tokens": 723340669.0, + "step": 18956 + }, + { + "epoch": 2.4115252512403003, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79003143310547, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8756909370422363, + "num_tokens": 723377556.0, + "step": 18957 + }, + { + "epoch": 2.411652461518891, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.790283203125, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8768277168273926, + "num_tokens": 723410942.0, + "step": 18958 + }, + { + "epoch": 2.4117796717974813, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.745271682739258, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8861238360404968, + "num_tokens": 723451131.0, + "step": 18959 + }, + { + "epoch": 2.411906882076072, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.917003631591797, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8775314092636108, + "num_tokens": 723485876.0, + "step": 18960 + }, + { + "epoch": 2.4120340923546624, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.927249908447266, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8808465003967285, + "num_tokens": 723521252.0, + "step": 18961 + }, + { + "epoch": 2.412161302633253, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.028825759887695, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8574835062026978, + "num_tokens": 723561094.0, + "step": 18962 + }, + { + "epoch": 2.4122885129118434, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.71180534362793, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8719711303710938, + "num_tokens": 723597281.0, + "step": 18963 + }, + { + "epoch": 2.412415723190434, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.241378784179688, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8967351913452148, + "num_tokens": 723637690.0, + "step": 18964 + }, + { + "epoch": 2.4125429334690245, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.870641708374023, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8740030527114868, + "num_tokens": 723676250.0, + "step": 18965 + }, + { + "epoch": 2.412670143747615, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.888078689575195, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8822084665298462, + "num_tokens": 723713157.0, + "step": 18966 + }, + { + "epoch": 2.412797354026205, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04977035522461, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8792059421539307, + "num_tokens": 723745877.0, + "step": 18967 + }, + { + "epoch": 2.412924564304796, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.997901916503906, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8755450248718262, + "num_tokens": 723784285.0, + "step": 18968 + }, + { + "epoch": 2.413051774583386, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.947961807250977, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.862309455871582, + "num_tokens": 723817085.0, + "step": 18969 + }, + { + "epoch": 2.4131789848619767, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.018169403076172, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8685480356216431, + "num_tokens": 723858466.0, + "step": 18970 + }, + { + "epoch": 2.413306195140567, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.161243438720703, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8734822869300842, + "num_tokens": 723896627.0, + "step": 18971 + }, + { + "epoch": 2.4134334054191577, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.953466415405273, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8681526184082031, + "num_tokens": 723930727.0, + "step": 18972 + }, + { + "epoch": 2.4135606156977483, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23949432373047, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8630441427230835, + "num_tokens": 723965996.0, + "step": 18973 + }, + { + "epoch": 2.413687825976339, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.226011276245117, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8687870502471924, + "num_tokens": 724002441.0, + "step": 18974 + }, + { + "epoch": 2.4138150362549293, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98404312133789, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8779239654541016, + "num_tokens": 724039238.0, + "step": 18975 + }, + { + "epoch": 2.41394224653352, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.029083251953125, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8787104487419128, + "num_tokens": 724077430.0, + "step": 18976 + }, + { + "epoch": 2.4140694568121104, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22122573852539, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8694727420806885, + "num_tokens": 724113175.0, + "step": 18977 + }, + { + "epoch": 2.414196667090701, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21822738647461, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8681418895721436, + "num_tokens": 724156409.0, + "step": 18978 + }, + { + "epoch": 2.4143238773692914, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86485481262207, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8740408420562744, + "num_tokens": 724199879.0, + "step": 18979 + }, + { + "epoch": 2.414451087647882, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.299259185791016, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8855978846549988, + "num_tokens": 724238065.0, + "step": 18980 + }, + { + "epoch": 2.4145782979264725, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.877044677734375, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8666446805000305, + "num_tokens": 724272336.0, + "step": 18981 + }, + { + "epoch": 2.414705508205063, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02193260192871, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8646991848945618, + "num_tokens": 724305200.0, + "step": 18982 + }, + { + "epoch": 2.4148327184836536, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.950796127319336, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8781399726867676, + "num_tokens": 724341295.0, + "step": 18983 + }, + { + "epoch": 2.414959928762244, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29927635192871, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8759119510650635, + "num_tokens": 724384336.0, + "step": 18984 + }, + { + "epoch": 2.4150871390408346, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.017366409301758, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8696385622024536, + "num_tokens": 724417279.0, + "step": 18985 + }, + { + "epoch": 2.415214349319425, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.989240646362305, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8726664781570435, + "num_tokens": 724456509.0, + "step": 18986 + }, + { + "epoch": 2.4153415595980157, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0217342376709, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8838715553283691, + "num_tokens": 724498812.0, + "step": 18987 + }, + { + "epoch": 2.415468769876606, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.907075881958008, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8612828850746155, + "num_tokens": 724538174.0, + "step": 18988 + }, + { + "epoch": 2.4155959801551967, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17218017578125, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8788250684738159, + "num_tokens": 724577171.0, + "step": 18989 + }, + { + "epoch": 2.415723190433787, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.264514923095703, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8947699069976807, + "num_tokens": 724610626.0, + "step": 18990 + }, + { + "epoch": 2.4158504007123778, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.797216415405273, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8858857154846191, + "num_tokens": 724647613.0, + "step": 18991 + }, + { + "epoch": 2.415977610990968, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.933055877685547, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8755417466163635, + "num_tokens": 724684898.0, + "step": 18992 + }, + { + "epoch": 2.4161048212695584, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87811279296875, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8692461252212524, + "num_tokens": 724725566.0, + "step": 18993 + }, + { + "epoch": 2.416232031548149, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.890718460083008, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8642148971557617, + "num_tokens": 724763701.0, + "step": 18994 + }, + { + "epoch": 2.4163592418267394, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.912208557128906, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8726581931114197, + "num_tokens": 724798108.0, + "step": 18995 + }, + { + "epoch": 2.41648645210533, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.313770294189453, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8532618284225464, + "num_tokens": 724842430.0, + "step": 18996 + }, + { + "epoch": 2.4166136623839205, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.79900360107422, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8690223693847656, + "num_tokens": 724884585.0, + "step": 18997 + }, + { + "epoch": 2.416740872662511, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89297103881836, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8843708038330078, + "num_tokens": 724919338.0, + "step": 18998 + }, + { + "epoch": 2.4168680829411016, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88153648376465, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8950645923614502, + "num_tokens": 724956007.0, + "step": 18999 + }, + { + "epoch": 2.416995293219692, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1513671875, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8891236186027527, + "num_tokens": 724996994.0, + "step": 19000 + }, + { + "epoch": 2.4171225034982826, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.961244583129883, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8678522109985352, + "num_tokens": 725035607.0, + "step": 19001 + }, + { + "epoch": 2.417249713776873, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.764142990112305, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.876367449760437, + "num_tokens": 725075370.0, + "step": 19002 + }, + { + "epoch": 2.4173769240554637, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.289813995361328, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8720517158508301, + "num_tokens": 725119536.0, + "step": 19003 + }, + { + "epoch": 2.417504134334054, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.764307022094727, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8732806444168091, + "num_tokens": 725158048.0, + "step": 19004 + }, + { + "epoch": 2.4176313446126447, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.013965606689453, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8827999234199524, + "num_tokens": 725196343.0, + "step": 19005 + }, + { + "epoch": 2.4177585548912353, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.910005569458008, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.873948872089386, + "num_tokens": 725234613.0, + "step": 19006 + }, + { + "epoch": 2.417885765169826, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.965938568115234, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8729913234710693, + "num_tokens": 725272129.0, + "step": 19007 + }, + { + "epoch": 2.4180129754484163, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.01342010498047, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.865452766418457, + "num_tokens": 725305740.0, + "step": 19008 + }, + { + "epoch": 2.418140185727007, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96392250061035, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8696238398551941, + "num_tokens": 725340579.0, + "step": 19009 + }, + { + "epoch": 2.4182673960055974, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.107341766357422, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8765356540679932, + "num_tokens": 725380492.0, + "step": 19010 + }, + { + "epoch": 2.418394606284188, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.831369400024414, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8759595155715942, + "num_tokens": 725418043.0, + "step": 19011 + }, + { + "epoch": 2.4185218165627784, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07241439819336, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8654769659042358, + "num_tokens": 725456260.0, + "step": 19012 + }, + { + "epoch": 2.418649026841369, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.005889892578125, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8710654973983765, + "num_tokens": 725496605.0, + "step": 19013 + }, + { + "epoch": 2.4187762371199595, + "ewc_loss": 0.037109375, + "ewc_loss_parallel": 3.719329833984375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.777503967285156, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8755944967269897, + "num_tokens": 725532395.0, + "step": 19014 + }, + { + "epoch": 2.4189034473985496, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.126609802246094, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8688040375709534, + "num_tokens": 725575335.0, + "step": 19015 + }, + { + "epoch": 2.4190306576771405, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.020198822021484, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8797214031219482, + "num_tokens": 725612345.0, + "step": 19016 + }, + { + "epoch": 2.4191578679557306, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.862211227416992, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8773033022880554, + "num_tokens": 725645566.0, + "step": 19017 + }, + { + "epoch": 2.419285078234321, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0140323638916, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8686133027076721, + "num_tokens": 725684218.0, + "step": 19018 + }, + { + "epoch": 2.4194122885129117, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.914852142333984, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8769848942756653, + "num_tokens": 725718478.0, + "step": 19019 + }, + { + "epoch": 2.419539498791502, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.986486434936523, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8663274049758911, + "num_tokens": 725756208.0, + "step": 19020 + }, + { + "epoch": 2.4196667090700927, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.052913665771484, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.86798495054245, + "num_tokens": 725791884.0, + "step": 19021 + }, + { + "epoch": 2.4197939193486833, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.952863693237305, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8776929378509521, + "num_tokens": 725829138.0, + "step": 19022 + }, + { + "epoch": 2.419921129627274, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.912521362304688, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8740156888961792, + "num_tokens": 725868301.0, + "step": 19023 + }, + { + "epoch": 2.4200483399058643, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05817413330078, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8631062507629395, + "num_tokens": 725906186.0, + "step": 19024 + }, + { + "epoch": 2.420175550184455, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.979524612426758, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8614094257354736, + "num_tokens": 725947006.0, + "step": 19025 + }, + { + "epoch": 2.4203027604630454, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.875669479370117, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8778098821640015, + "num_tokens": 725987363.0, + "step": 19026 + }, + { + "epoch": 2.420429970741636, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07428741455078, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8648800849914551, + "num_tokens": 726026176.0, + "step": 19027 + }, + { + "epoch": 2.4205571810202264, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.871091842651367, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.872260332107544, + "num_tokens": 726068124.0, + "step": 19028 + }, + { + "epoch": 2.420684391298817, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99358558654785, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8655228614807129, + "num_tokens": 726110270.0, + "step": 19029 + }, + { + "epoch": 2.4208116015774075, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.864673614501953, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.88884037733078, + "num_tokens": 726143908.0, + "step": 19030 + }, + { + "epoch": 2.420938811855998, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.044923782348633, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8710168600082397, + "num_tokens": 726179082.0, + "step": 19031 + }, + { + "epoch": 2.4210660221345885, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.003498077392578, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8672176599502563, + "num_tokens": 726216387.0, + "step": 19032 + }, + { + "epoch": 2.421193232413179, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89601707458496, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8818284869194031, + "num_tokens": 726258831.0, + "step": 19033 + }, + { + "epoch": 2.4213204426917696, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.763805389404297, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8754479885101318, + "num_tokens": 726301929.0, + "step": 19034 + }, + { + "epoch": 2.42144765297036, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.850685119628906, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8771501779556274, + "num_tokens": 726339938.0, + "step": 19035 + }, + { + "epoch": 2.4215748632489507, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.875232696533203, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.875670850276947, + "num_tokens": 726377197.0, + "step": 19036 + }, + { + "epoch": 2.421702073527541, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.86188507080078, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8768045902252197, + "num_tokens": 726417591.0, + "step": 19037 + }, + { + "epoch": 2.4218292838061317, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85342025756836, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8637491464614868, + "num_tokens": 726456094.0, + "step": 19038 + }, + { + "epoch": 2.4219564940847222, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.997032165527344, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8816615343093872, + "num_tokens": 726491158.0, + "step": 19039 + }, + { + "epoch": 2.4220837043633123, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98863410949707, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8599823713302612, + "num_tokens": 726531334.0, + "step": 19040 + }, + { + "epoch": 2.4222109146419033, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.012746810913086, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8640833497047424, + "num_tokens": 726571999.0, + "step": 19041 + }, + { + "epoch": 2.4223381249204934, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0156307220459, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8650727868080139, + "num_tokens": 726604945.0, + "step": 19042 + }, + { + "epoch": 2.422465335199084, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.925825119018555, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8648481369018555, + "num_tokens": 726645023.0, + "step": 19043 + }, + { + "epoch": 2.4225925454776744, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.889583587646484, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8727818727493286, + "num_tokens": 726684297.0, + "step": 19044 + }, + { + "epoch": 2.422719755756265, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.046205520629883, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8626120090484619, + "num_tokens": 726726950.0, + "step": 19045 + }, + { + "epoch": 2.4228469660348555, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.84421730041504, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8550686836242676, + "num_tokens": 726761944.0, + "step": 19046 + }, + { + "epoch": 2.422974176313446, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.786571502685547, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8835440874099731, + "num_tokens": 726799041.0, + "step": 19047 + }, + { + "epoch": 2.4231013865920366, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.131376266479492, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8673543930053711, + "num_tokens": 726838160.0, + "step": 19048 + }, + { + "epoch": 2.423228596870627, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.890026092529297, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.882837176322937, + "num_tokens": 726873150.0, + "step": 19049 + }, + { + "epoch": 2.4233558071492176, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.80198097229004, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8776862621307373, + "num_tokens": 726905253.0, + "step": 19050 + }, + { + "epoch": 2.423483017427808, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.970739364624023, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8831632733345032, + "num_tokens": 726942842.0, + "step": 19051 + }, + { + "epoch": 2.4236102277063987, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.912485122680664, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8776233196258545, + "num_tokens": 726987974.0, + "step": 19052 + }, + { + "epoch": 2.423737437984989, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81007957458496, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8915355205535889, + "num_tokens": 727021614.0, + "step": 19053 + }, + { + "epoch": 2.4238646482635797, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.163509368896484, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8709344267845154, + "num_tokens": 727061773.0, + "step": 19054 + }, + { + "epoch": 2.4239918585421703, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85387420654297, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8680244088172913, + "num_tokens": 727095877.0, + "step": 19055 + }, + { + "epoch": 2.424119068820761, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.086544036865234, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8761568665504456, + "num_tokens": 727128676.0, + "step": 19056 + }, + { + "epoch": 2.4242462790993513, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12908172607422, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8818985819816589, + "num_tokens": 727164528.0, + "step": 19057 + }, + { + "epoch": 2.424373489377942, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.098377227783203, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8797837495803833, + "num_tokens": 727202403.0, + "step": 19058 + }, + { + "epoch": 2.4245006996565324, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.101991653442383, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8760411739349365, + "num_tokens": 727242231.0, + "step": 19059 + }, + { + "epoch": 2.424627909935123, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.933757781982422, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8605696558952332, + "num_tokens": 727276371.0, + "step": 19060 + }, + { + "epoch": 2.4247551202137134, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05585289001465, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8767191171646118, + "num_tokens": 727320101.0, + "step": 19061 + }, + { + "epoch": 2.424882330492304, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.873687744140625, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8652328252792358, + "num_tokens": 727363687.0, + "step": 19062 + }, + { + "epoch": 2.4250095407708945, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.881547927856445, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8787916898727417, + "num_tokens": 727399772.0, + "step": 19063 + }, + { + "epoch": 2.425136751049485, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.117923736572266, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8815189599990845, + "num_tokens": 727442944.0, + "step": 19064 + }, + { + "epoch": 2.425263961328075, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.857969284057617, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8816545009613037, + "num_tokens": 727483283.0, + "step": 19065 + }, + { + "epoch": 2.425391171606666, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.932832717895508, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8723080158233643, + "num_tokens": 727526316.0, + "step": 19066 + }, + { + "epoch": 2.425518381885256, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.809295654296875, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8802217245101929, + "num_tokens": 727563218.0, + "step": 19067 + }, + { + "epoch": 2.4256455921638467, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.872968673706055, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8864462375640869, + "num_tokens": 727594877.0, + "step": 19068 + }, + { + "epoch": 2.425772802442437, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0748291015625, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8867676854133606, + "num_tokens": 727627561.0, + "step": 19069 + }, + { + "epoch": 2.4259000127210277, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.842226028442383, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8647305965423584, + "num_tokens": 727664384.0, + "step": 19070 + }, + { + "epoch": 2.4260272229996183, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.822315216064453, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8703954219818115, + "num_tokens": 727700866.0, + "step": 19071 + }, + { + "epoch": 2.426154433278209, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10354232788086, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8905595541000366, + "num_tokens": 727741539.0, + "step": 19072 + }, + { + "epoch": 2.4262816435567993, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.762977600097656, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8737501502037048, + "num_tokens": 727777828.0, + "step": 19073 + }, + { + "epoch": 2.42640885383539, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.896469116210938, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8816269636154175, + "num_tokens": 727818979.0, + "step": 19074 + }, + { + "epoch": 2.4265360641139804, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.892915725708008, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8749204874038696, + "num_tokens": 727856440.0, + "step": 19075 + }, + { + "epoch": 2.426663274392571, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09468650817871, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8816077709197998, + "num_tokens": 727895036.0, + "step": 19076 + }, + { + "epoch": 2.4267904846711614, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.817285537719727, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.863189697265625, + "num_tokens": 727932292.0, + "step": 19077 + }, + { + "epoch": 2.426917694949752, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88465118408203, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8739452958106995, + "num_tokens": 727967005.0, + "step": 19078 + }, + { + "epoch": 2.4270449052283425, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.965328216552734, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8650768995285034, + "num_tokens": 728004262.0, + "step": 19079 + }, + { + "epoch": 2.427172115506933, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89632797241211, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8833324909210205, + "num_tokens": 728041451.0, + "step": 19080 + }, + { + "epoch": 2.4272993257855235, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.859521865844727, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8726822733879089, + "num_tokens": 728079217.0, + "step": 19081 + }, + { + "epoch": 2.427426536064114, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96147918701172, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8890639543533325, + "num_tokens": 728116221.0, + "step": 19082 + }, + { + "epoch": 2.4275537463427046, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.83285140991211, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8661767840385437, + "num_tokens": 728154861.0, + "step": 19083 + }, + { + "epoch": 2.427680956621295, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.082639694213867, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8631055355072021, + "num_tokens": 728188276.0, + "step": 19084 + }, + { + "epoch": 2.4278081668998857, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.01081085205078, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8845642805099487, + "num_tokens": 728229313.0, + "step": 19085 + }, + { + "epoch": 2.427935377178476, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.830142974853516, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8628802299499512, + "num_tokens": 728268706.0, + "step": 19086 + }, + { + "epoch": 2.4280625874570667, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.028066635131836, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8867378234863281, + "num_tokens": 728297490.0, + "step": 19087 + }, + { + "epoch": 2.428189797735657, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.81571388244629, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8665190935134888, + "num_tokens": 728336041.0, + "step": 19088 + }, + { + "epoch": 2.4283170080142478, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.919736862182617, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8812469840049744, + "num_tokens": 728372701.0, + "step": 19089 + }, + { + "epoch": 2.428444218292838, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97095489501953, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.877389669418335, + "num_tokens": 728408138.0, + "step": 19090 + }, + { + "epoch": 2.4285714285714284, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.899450302124023, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8804859519004822, + "num_tokens": 728452649.0, + "step": 19091 + }, + { + "epoch": 2.428698638850019, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.833223342895508, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8642306923866272, + "num_tokens": 728490551.0, + "step": 19092 + }, + { + "epoch": 2.4288258491286094, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98501205444336, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8803601264953613, + "num_tokens": 728529558.0, + "step": 19093 + }, + { + "epoch": 2.4289530594072, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.813936233520508, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8508840799331665, + "num_tokens": 728573495.0, + "step": 19094 + }, + { + "epoch": 2.4290802696857905, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.912912368774414, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.874452531337738, + "num_tokens": 728609166.0, + "step": 19095 + }, + { + "epoch": 2.429207479964381, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.085163116455078, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8806577920913696, + "num_tokens": 728653070.0, + "step": 19096 + }, + { + "epoch": 2.4293346902429716, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.803573608398438, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8773030042648315, + "num_tokens": 728689142.0, + "step": 19097 + }, + { + "epoch": 2.429461900521562, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99320411682129, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8785485625267029, + "num_tokens": 728725615.0, + "step": 19098 + }, + { + "epoch": 2.4295891108001526, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94780731201172, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8643479347229004, + "num_tokens": 728762366.0, + "step": 19099 + }, + { + "epoch": 2.429716321078743, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10418128967285, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8617253303527832, + "num_tokens": 728801951.0, + "step": 19100 + }, + { + "epoch": 2.4298435313573337, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.822568893432617, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8825750946998596, + "num_tokens": 728836403.0, + "step": 19101 + }, + { + "epoch": 2.429970741635924, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90982437133789, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8809411525726318, + "num_tokens": 728878485.0, + "step": 19102 + }, + { + "epoch": 2.4300979519145147, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85773468017578, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8835113644599915, + "num_tokens": 728915935.0, + "step": 19103 + }, + { + "epoch": 2.4302251621931052, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.893014907836914, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8569936752319336, + "num_tokens": 728956847.0, + "step": 19104 + }, + { + "epoch": 2.4303523724716958, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.809856414794922, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8790799379348755, + "num_tokens": 728994865.0, + "step": 19105 + }, + { + "epoch": 2.4304795827502863, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.920623779296875, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8672114610671997, + "num_tokens": 729033030.0, + "step": 19106 + }, + { + "epoch": 2.430606793028877, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.892072677612305, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.883853018283844, + "num_tokens": 729070430.0, + "step": 19107 + }, + { + "epoch": 2.4307340033074674, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.864871978759766, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8835322260856628, + "num_tokens": 729102127.0, + "step": 19108 + }, + { + "epoch": 2.430861213586058, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.917814254760742, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8773760795593262, + "num_tokens": 729138128.0, + "step": 19109 + }, + { + "epoch": 2.4309884238646484, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.063159942626953, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8712024092674255, + "num_tokens": 729179756.0, + "step": 19110 + }, + { + "epoch": 2.431115634143239, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95745849609375, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8620905876159668, + "num_tokens": 729221356.0, + "step": 19111 + }, + { + "epoch": 2.4312428444218295, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.926525115966797, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8787611126899719, + "num_tokens": 729260456.0, + "step": 19112 + }, + { + "epoch": 2.4313700547004196, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.942903518676758, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8618995547294617, + "num_tokens": 729299654.0, + "step": 19113 + }, + { + "epoch": 2.4314972649790105, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92654037475586, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8768751621246338, + "num_tokens": 729341296.0, + "step": 19114 + }, + { + "epoch": 2.4316244752576006, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.78742790222168, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8780906200408936, + "num_tokens": 729379632.0, + "step": 19115 + }, + { + "epoch": 2.431751685536191, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.085546493530273, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8752892017364502, + "num_tokens": 729423202.0, + "step": 19116 + }, + { + "epoch": 2.4318788958147817, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8497371673584, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8928855061531067, + "num_tokens": 729457108.0, + "step": 19117 + }, + { + "epoch": 2.432006106093372, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8817081451416, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.87022864818573, + "num_tokens": 729494065.0, + "step": 19118 + }, + { + "epoch": 2.4321333163719627, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.090105056762695, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8774908781051636, + "num_tokens": 729536621.0, + "step": 19119 + }, + { + "epoch": 2.4322605266505533, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.7447566986084, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8743951916694641, + "num_tokens": 729574423.0, + "step": 19120 + }, + { + "epoch": 2.432387736929144, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87954330444336, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8685791492462158, + "num_tokens": 729614673.0, + "step": 19121 + }, + { + "epoch": 2.4325149472077343, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.888856887817383, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8756389021873474, + "num_tokens": 729653049.0, + "step": 19122 + }, + { + "epoch": 2.432642157486325, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.825057983398438, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8751694560050964, + "num_tokens": 729689667.0, + "step": 19123 + }, + { + "epoch": 2.4327693677649154, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.843782424926758, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8777728080749512, + "num_tokens": 729734747.0, + "step": 19124 + }, + { + "epoch": 2.432896578043506, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.660367965698242, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8695123195648193, + "num_tokens": 729770135.0, + "step": 19125 + }, + { + "epoch": 2.4330237883220964, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93686866760254, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8732151985168457, + "num_tokens": 729809203.0, + "step": 19126 + }, + { + "epoch": 2.433150998600687, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.906959533691406, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8802415132522583, + "num_tokens": 729850164.0, + "step": 19127 + }, + { + "epoch": 2.4332782088792775, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.839075088500977, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8631206750869751, + "num_tokens": 729884337.0, + "step": 19128 + }, + { + "epoch": 2.433405419157868, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.923870086669922, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8866950273513794, + "num_tokens": 729924098.0, + "step": 19129 + }, + { + "epoch": 2.4335326294364585, + "ewc_loss": 0.037353515625, + "ewc_loss_parallel": 3.743171691894531e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.715682983398438, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8919756412506104, + "num_tokens": 729960671.0, + "step": 19130 + }, + { + "epoch": 2.433659839715049, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95008087158203, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8747959733009338, + "num_tokens": 730000780.0, + "step": 19131 + }, + { + "epoch": 2.4337870499936396, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08910369873047, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.871932327747345, + "num_tokens": 730045198.0, + "step": 19132 + }, + { + "epoch": 2.43391426027223, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.943429946899414, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8674768209457397, + "num_tokens": 730085010.0, + "step": 19133 + }, + { + "epoch": 2.4340414705508207, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.811147689819336, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8685506582260132, + "num_tokens": 730121512.0, + "step": 19134 + }, + { + "epoch": 2.434168680829411, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85147476196289, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8730707168579102, + "num_tokens": 730164859.0, + "step": 19135 + }, + { + "epoch": 2.4342958911080017, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09395980834961, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8747794032096863, + "num_tokens": 730202273.0, + "step": 19136 + }, + { + "epoch": 2.4344231013865922, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.913854598999023, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8670318126678467, + "num_tokens": 730248208.0, + "step": 19137 + }, + { + "epoch": 2.4345503116651823, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89423370361328, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8834002017974854, + "num_tokens": 730291084.0, + "step": 19138 + }, + { + "epoch": 2.4346775219437733, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.026296615600586, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8782172799110413, + "num_tokens": 730334504.0, + "step": 19139 + }, + { + "epoch": 2.4348047322223634, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.912029266357422, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8768717646598816, + "num_tokens": 730373356.0, + "step": 19140 + }, + { + "epoch": 2.434931942500954, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.064056396484375, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.861602783203125, + "num_tokens": 730416229.0, + "step": 19141 + }, + { + "epoch": 2.4350591527795444, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.902679443359375, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8860691785812378, + "num_tokens": 730458922.0, + "step": 19142 + }, + { + "epoch": 2.435186363058135, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87537384033203, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.872714638710022, + "num_tokens": 730494583.0, + "step": 19143 + }, + { + "epoch": 2.4353135733367255, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.034011840820312, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.87615567445755, + "num_tokens": 730529526.0, + "step": 19144 + }, + { + "epoch": 2.435440783615316, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.822607040405273, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8779901266098022, + "num_tokens": 730558584.0, + "step": 19145 + }, + { + "epoch": 2.4355679938939065, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.913278579711914, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8746574521064758, + "num_tokens": 730598677.0, + "step": 19146 + }, + { + "epoch": 2.435695204172497, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.898651123046875, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8836928606033325, + "num_tokens": 730636985.0, + "step": 19147 + }, + { + "epoch": 2.4358224144510876, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.945566177368164, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8570466637611389, + "num_tokens": 730675409.0, + "step": 19148 + }, + { + "epoch": 2.435949624729678, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.855411529541016, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.862841010093689, + "num_tokens": 730714739.0, + "step": 19149 + }, + { + "epoch": 2.4360768350082687, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.879352569580078, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8727011680603027, + "num_tokens": 730757059.0, + "step": 19150 + }, + { + "epoch": 2.436204045286859, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.890413284301758, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8722846508026123, + "num_tokens": 730793469.0, + "step": 19151 + }, + { + "epoch": 2.4363312555654497, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.924537658691406, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8788546323776245, + "num_tokens": 730829336.0, + "step": 19152 + }, + { + "epoch": 2.4364584658440402, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.877906799316406, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8722547292709351, + "num_tokens": 730869304.0, + "step": 19153 + }, + { + "epoch": 2.4365856761226308, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07000160217285, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8618171215057373, + "num_tokens": 730905315.0, + "step": 19154 + }, + { + "epoch": 2.4367128864012213, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0467529296875, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8653631210327148, + "num_tokens": 730947346.0, + "step": 19155 + }, + { + "epoch": 2.436840096679812, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07819938659668, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8784064054489136, + "num_tokens": 730976907.0, + "step": 19156 + }, + { + "epoch": 2.4369673069584024, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.938600540161133, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8705490231513977, + "num_tokens": 731015940.0, + "step": 19157 + }, + { + "epoch": 2.437094517236993, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.998218536376953, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8895522356033325, + "num_tokens": 731051345.0, + "step": 19158 + }, + { + "epoch": 2.4372217275155834, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.008121490478516, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8424583673477173, + "num_tokens": 731089642.0, + "step": 19159 + }, + { + "epoch": 2.437348937794174, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.971010208129883, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8908633589744568, + "num_tokens": 731133196.0, + "step": 19160 + }, + { + "epoch": 2.4374761480727645, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.145463943481445, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8624349236488342, + "num_tokens": 731175510.0, + "step": 19161 + }, + { + "epoch": 2.437603358351355, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09568977355957, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.881956934928894, + "num_tokens": 731217526.0, + "step": 19162 + }, + { + "epoch": 2.437730568629945, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.903194427490234, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8611390590667725, + "num_tokens": 731259525.0, + "step": 19163 + }, + { + "epoch": 2.437857778908536, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.75423240661621, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8778831958770752, + "num_tokens": 731301480.0, + "step": 19164 + }, + { + "epoch": 2.437984989187126, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.966697692871094, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8823809623718262, + "num_tokens": 731347385.0, + "step": 19165 + }, + { + "epoch": 2.4381121994657167, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08526039123535, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8787795305252075, + "num_tokens": 731383666.0, + "step": 19166 + }, + { + "epoch": 2.438239409744307, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.101951599121094, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8740322589874268, + "num_tokens": 731420297.0, + "step": 19167 + }, + { + "epoch": 2.4383666200228977, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.986724853515625, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8704152703285217, + "num_tokens": 731455083.0, + "step": 19168 + }, + { + "epoch": 2.4384938303014883, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.876827239990234, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8825331926345825, + "num_tokens": 731482555.0, + "step": 19169 + }, + { + "epoch": 2.438621040580079, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90804672241211, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8863736391067505, + "num_tokens": 731519459.0, + "step": 19170 + }, + { + "epoch": 2.4387482508586693, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.978527069091797, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8787711262702942, + "num_tokens": 731554577.0, + "step": 19171 + }, + { + "epoch": 2.43887546113726, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.241411209106445, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8651442527770996, + "num_tokens": 731594993.0, + "step": 19172 + }, + { + "epoch": 2.4390026714158504, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.997953414916992, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8780443072319031, + "num_tokens": 731629182.0, + "step": 19173 + }, + { + "epoch": 2.439129881694441, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.849369049072266, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8735877275466919, + "num_tokens": 731667661.0, + "step": 19174 + }, + { + "epoch": 2.4392570919730314, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27117156982422, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8744811415672302, + "num_tokens": 731704911.0, + "step": 19175 + }, + { + "epoch": 2.439384302251622, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92824363708496, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8572578430175781, + "num_tokens": 731744990.0, + "step": 19176 + }, + { + "epoch": 2.4395115125302125, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19488525390625, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8754295706748962, + "num_tokens": 731783335.0, + "step": 19177 + }, + { + "epoch": 2.439638722808803, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.890533447265625, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8765785694122314, + "num_tokens": 731819206.0, + "step": 19178 + }, + { + "epoch": 2.4397659330873935, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.959104537963867, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.871479332447052, + "num_tokens": 731855291.0, + "step": 19179 + }, + { + "epoch": 2.439893143365984, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.877819061279297, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8779483437538147, + "num_tokens": 731899392.0, + "step": 19180 + }, + { + "epoch": 2.4400203536445746, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98850440979004, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8729602694511414, + "num_tokens": 731937502.0, + "step": 19181 + }, + { + "epoch": 2.440147563923165, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.892305374145508, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8767423033714294, + "num_tokens": 731974946.0, + "step": 19182 + }, + { + "epoch": 2.4402747742017556, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.16240882873535, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8619188070297241, + "num_tokens": 732018732.0, + "step": 19183 + }, + { + "epoch": 2.440401984480346, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.988046646118164, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8733929395675659, + "num_tokens": 732052036.0, + "step": 19184 + }, + { + "epoch": 2.4405291947589367, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.989919662475586, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8689987659454346, + "num_tokens": 732090944.0, + "step": 19185 + }, + { + "epoch": 2.440656405037527, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.011829376220703, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8764835596084595, + "num_tokens": 732131406.0, + "step": 19186 + }, + { + "epoch": 2.4407836153161178, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.124217987060547, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8727864027023315, + "num_tokens": 732171701.0, + "step": 19187 + }, + { + "epoch": 2.440910825594708, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12306022644043, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.882163405418396, + "num_tokens": 732209658.0, + "step": 19188 + }, + { + "epoch": 2.4410380358732984, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94205093383789, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8616195321083069, + "num_tokens": 732251395.0, + "step": 19189 + }, + { + "epoch": 2.441165246151889, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.003450393676758, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8638916611671448, + "num_tokens": 732289306.0, + "step": 19190 + }, + { + "epoch": 2.4412924564304794, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.194700241088867, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.859684944152832, + "num_tokens": 732337916.0, + "step": 19191 + }, + { + "epoch": 2.44141966670907, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.335697174072266, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8605589866638184, + "num_tokens": 732380358.0, + "step": 19192 + }, + { + "epoch": 2.4415468769876605, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.969764709472656, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8835904598236084, + "num_tokens": 732416534.0, + "step": 19193 + }, + { + "epoch": 2.441674087266251, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.929100036621094, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.853786826133728, + "num_tokens": 732458927.0, + "step": 19194 + }, + { + "epoch": 2.4418012975448415, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.124034881591797, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8651207089424133, + "num_tokens": 732501261.0, + "step": 19195 + }, + { + "epoch": 2.441928507823432, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.970064163208008, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8821848630905151, + "num_tokens": 732541162.0, + "step": 19196 + }, + { + "epoch": 2.4420557181020226, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06688117980957, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8701504468917847, + "num_tokens": 732578237.0, + "step": 19197 + }, + { + "epoch": 2.442182928380613, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.893138885498047, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8639897108078003, + "num_tokens": 732618624.0, + "step": 19198 + }, + { + "epoch": 2.4423101386592037, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.915281295776367, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8701200485229492, + "num_tokens": 732661747.0, + "step": 19199 + }, + { + "epoch": 2.442437348937794, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.019712448120117, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8530282378196716, + "num_tokens": 732703511.0, + "step": 19200 + }, + { + "epoch": 2.4425645592163847, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89282989501953, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8537096977233887, + "num_tokens": 732746749.0, + "step": 19201 + }, + { + "epoch": 2.4426917694949752, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.845722198486328, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8753302097320557, + "num_tokens": 732783184.0, + "step": 19202 + }, + { + "epoch": 2.4428189797735658, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.01725959777832, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8681324124336243, + "num_tokens": 732818664.0, + "step": 19203 + }, + { + "epoch": 2.4429461900521563, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.793981552124023, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8626563549041748, + "num_tokens": 732854286.0, + "step": 19204 + }, + { + "epoch": 2.443073400330747, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18382453918457, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8805381655693054, + "num_tokens": 732893032.0, + "step": 19205 + }, + { + "epoch": 2.4432006106093374, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.074708938598633, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8724374771118164, + "num_tokens": 732929942.0, + "step": 19206 + }, + { + "epoch": 2.443327820887928, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.015335083007812, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8695065379142761, + "num_tokens": 732968218.0, + "step": 19207 + }, + { + "epoch": 2.4434550311665184, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.961883544921875, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8973384499549866, + "num_tokens": 733001464.0, + "step": 19208 + }, + { + "epoch": 2.443582241445109, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.263763427734375, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8646396994590759, + "num_tokens": 733036311.0, + "step": 19209 + }, + { + "epoch": 2.4437094517236995, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0056095123291, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8787740468978882, + "num_tokens": 733070487.0, + "step": 19210 + }, + { + "epoch": 2.4438366620022896, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.167316436767578, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.872413694858551, + "num_tokens": 733110357.0, + "step": 19211 + }, + { + "epoch": 2.4439638722808805, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17898178100586, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8653737306594849, + "num_tokens": 733149070.0, + "step": 19212 + }, + { + "epoch": 2.4440910825594706, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.017724990844727, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8876990079879761, + "num_tokens": 733180848.0, + "step": 19213 + }, + { + "epoch": 2.444218292838061, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.020536422729492, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8752982020378113, + "num_tokens": 733219204.0, + "step": 19214 + }, + { + "epoch": 2.4443455031166517, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12692642211914, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.863413393497467, + "num_tokens": 733255237.0, + "step": 19215 + }, + { + "epoch": 2.444472713395242, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.021841049194336, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8839754462242126, + "num_tokens": 733293243.0, + "step": 19216 + }, + { + "epoch": 2.4445999236738327, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92540168762207, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8760427832603455, + "num_tokens": 733333530.0, + "step": 19217 + }, + { + "epoch": 2.4447271339524232, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.972185134887695, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8560816645622253, + "num_tokens": 733373790.0, + "step": 19218 + }, + { + "epoch": 2.4448543442310138, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.151247024536133, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.87628573179245, + "num_tokens": 733416037.0, + "step": 19219 + }, + { + "epoch": 2.4449815545096043, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.094375610351562, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8702359199523926, + "num_tokens": 733453014.0, + "step": 19220 + }, + { + "epoch": 2.445108764788195, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.797134399414062, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8582533001899719, + "num_tokens": 733491197.0, + "step": 19221 + }, + { + "epoch": 2.4452359750667854, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.150667190551758, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8863773345947266, + "num_tokens": 733531142.0, + "step": 19222 + }, + { + "epoch": 2.445363185345376, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9483642578125, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8679356575012207, + "num_tokens": 733573916.0, + "step": 19223 + }, + { + "epoch": 2.4454903956239664, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.857982635498047, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8679267764091492, + "num_tokens": 733613204.0, + "step": 19224 + }, + { + "epoch": 2.445617605902557, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.936359405517578, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8689414262771606, + "num_tokens": 733653618.0, + "step": 19225 + }, + { + "epoch": 2.4457448161811475, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.970815658569336, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8892274498939514, + "num_tokens": 733687795.0, + "step": 19226 + }, + { + "epoch": 2.445872026459738, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.032468795776367, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8733850717544556, + "num_tokens": 733723574.0, + "step": 19227 + }, + { + "epoch": 2.4459992367383285, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.00878143310547, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8801717162132263, + "num_tokens": 733762051.0, + "step": 19228 + }, + { + "epoch": 2.446126447016919, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.612751007080078, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8890484571456909, + "num_tokens": 733800640.0, + "step": 19229 + }, + { + "epoch": 2.4462536572955096, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.990270614624023, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8637404441833496, + "num_tokens": 733841321.0, + "step": 19230 + }, + { + "epoch": 2.4463808675741, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94869041442871, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8762465119361877, + "num_tokens": 733874104.0, + "step": 19231 + }, + { + "epoch": 2.4465080778526906, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.867923736572266, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8773999214172363, + "num_tokens": 733913127.0, + "step": 19232 + }, + { + "epoch": 2.446635288131281, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.902135848999023, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8894214034080505, + "num_tokens": 733951900.0, + "step": 19233 + }, + { + "epoch": 2.4467624984098717, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.983009338378906, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8753160238265991, + "num_tokens": 733992866.0, + "step": 19234 + }, + { + "epoch": 2.4468897086884622, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.00389289855957, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.868226945400238, + "num_tokens": 734029895.0, + "step": 19235 + }, + { + "epoch": 2.4470169189670523, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.901212692260742, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8896133899688721, + "num_tokens": 734066954.0, + "step": 19236 + }, + { + "epoch": 2.4471441292456433, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03443717956543, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8947887420654297, + "num_tokens": 734102412.0, + "step": 19237 + }, + { + "epoch": 2.4472713395242334, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.884138107299805, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8894572257995605, + "num_tokens": 734141400.0, + "step": 19238 + }, + { + "epoch": 2.447398549802824, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05882453918457, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8543676137924194, + "num_tokens": 734176628.0, + "step": 19239 + }, + { + "epoch": 2.4475257600814144, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.788644790649414, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8795085549354553, + "num_tokens": 734219756.0, + "step": 19240 + }, + { + "epoch": 2.447652970360005, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33434295654297, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8730948567390442, + "num_tokens": 734250509.0, + "step": 19241 + }, + { + "epoch": 2.4477801806385955, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05646324157715, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8808653354644775, + "num_tokens": 734287887.0, + "step": 19242 + }, + { + "epoch": 2.447907390917186, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.990190505981445, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8782293796539307, + "num_tokens": 734330446.0, + "step": 19243 + }, + { + "epoch": 2.4480346011957765, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1016902923584, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8793286085128784, + "num_tokens": 734368427.0, + "step": 19244 + }, + { + "epoch": 2.448161811474367, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9508113861084, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8921475410461426, + "num_tokens": 734405103.0, + "step": 19245 + }, + { + "epoch": 2.4482890217529576, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08036231994629, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8703868389129639, + "num_tokens": 734448069.0, + "step": 19246 + }, + { + "epoch": 2.448416232031548, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05353546142578, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8602977991104126, + "num_tokens": 734486016.0, + "step": 19247 + }, + { + "epoch": 2.4485434423101387, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.990787506103516, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8626412153244019, + "num_tokens": 734526943.0, + "step": 19248 + }, + { + "epoch": 2.448670652588729, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99370002746582, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.887222409248352, + "num_tokens": 734566137.0, + "step": 19249 + }, + { + "epoch": 2.4487978628673197, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9234619140625, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.871716320514679, + "num_tokens": 734602366.0, + "step": 19250 + }, + { + "epoch": 2.4489250731459102, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.746898651123047, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.881843090057373, + "num_tokens": 734644251.0, + "step": 19251 + }, + { + "epoch": 2.4490522834245008, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.067432403564453, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8702867031097412, + "num_tokens": 734682938.0, + "step": 19252 + }, + { + "epoch": 2.4491794937030913, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98956298828125, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8751979470252991, + "num_tokens": 734722975.0, + "step": 19253 + }, + { + "epoch": 2.449306703981682, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.941802978515625, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8765232563018799, + "num_tokens": 734761361.0, + "step": 19254 + }, + { + "epoch": 2.4494339142602723, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21016502380371, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8722168207168579, + "num_tokens": 734800597.0, + "step": 19255 + }, + { + "epoch": 2.449561124538863, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.795183181762695, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8625717163085938, + "num_tokens": 734840575.0, + "step": 19256 + }, + { + "epoch": 2.4496883348174534, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.959030151367188, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8649303913116455, + "num_tokens": 734876943.0, + "step": 19257 + }, + { + "epoch": 2.449815545096044, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.879352569580078, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8896701335906982, + "num_tokens": 734916436.0, + "step": 19258 + }, + { + "epoch": 2.4499427553746345, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.16208267211914, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8729015588760376, + "num_tokens": 734956336.0, + "step": 19259 + }, + { + "epoch": 2.450069965653225, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.16783332824707, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.872763991355896, + "num_tokens": 734997132.0, + "step": 19260 + }, + { + "epoch": 2.450197175931815, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90321159362793, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8622205853462219, + "num_tokens": 735036192.0, + "step": 19261 + }, + { + "epoch": 2.450324386210406, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98811912536621, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8873615860939026, + "num_tokens": 735079368.0, + "step": 19262 + }, + { + "epoch": 2.450451596488996, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.219289779663086, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8801667094230652, + "num_tokens": 735109714.0, + "step": 19263 + }, + { + "epoch": 2.4505788067675867, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.076663970947266, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.870773196220398, + "num_tokens": 735152712.0, + "step": 19264 + }, + { + "epoch": 2.450706017046177, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.048765182495117, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8693326711654663, + "num_tokens": 735193313.0, + "step": 19265 + }, + { + "epoch": 2.4508332273247677, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.163782119750977, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8881226778030396, + "num_tokens": 735235413.0, + "step": 19266 + }, + { + "epoch": 2.4509604376033582, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.856517791748047, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8745875954627991, + "num_tokens": 735270809.0, + "step": 19267 + }, + { + "epoch": 2.4510876478819488, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.20004653930664, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8672026991844177, + "num_tokens": 735309790.0, + "step": 19268 + }, + { + "epoch": 2.4512148581605393, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.126203536987305, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8744758367538452, + "num_tokens": 735341084.0, + "step": 19269 + }, + { + "epoch": 2.45134206843913, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.996692657470703, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8907091021537781, + "num_tokens": 735376964.0, + "step": 19270 + }, + { + "epoch": 2.4514692787177204, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.034711837768555, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8838216066360474, + "num_tokens": 735413296.0, + "step": 19271 + }, + { + "epoch": 2.451596488996311, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.049297332763672, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8781640529632568, + "num_tokens": 735454992.0, + "step": 19272 + }, + { + "epoch": 2.4517236992749014, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.149320602416992, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8762211203575134, + "num_tokens": 735493315.0, + "step": 19273 + }, + { + "epoch": 2.451850909553492, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.883827209472656, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8818612098693848, + "num_tokens": 735532666.0, + "step": 19274 + }, + { + "epoch": 2.4519781198320825, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07682991027832, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8720778226852417, + "num_tokens": 735571987.0, + "step": 19275 + }, + { + "epoch": 2.452105330110673, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.764366149902344, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8719688653945923, + "num_tokens": 735609409.0, + "step": 19276 + }, + { + "epoch": 2.4522325403892635, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.161508560180664, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8481701016426086, + "num_tokens": 735647299.0, + "step": 19277 + }, + { + "epoch": 2.452359750667854, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.067337036132812, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8823676705360413, + "num_tokens": 735677757.0, + "step": 19278 + }, + { + "epoch": 2.4524869609464446, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.932104110717773, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8791482448577881, + "num_tokens": 735723924.0, + "step": 19279 + }, + { + "epoch": 2.452614171225035, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.012088775634766, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8592456579208374, + "num_tokens": 735758105.0, + "step": 19280 + }, + { + "epoch": 2.4527413815036256, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39292335510254, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8706507086753845, + "num_tokens": 735797603.0, + "step": 19281 + }, + { + "epoch": 2.452868591782216, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.930540084838867, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8477315902709961, + "num_tokens": 735838958.0, + "step": 19282 + }, + { + "epoch": 2.4529958020608067, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.172096252441406, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8663061857223511, + "num_tokens": 735878894.0, + "step": 19283 + }, + { + "epoch": 2.453123012339397, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.011104583740234, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.9036685824394226, + "num_tokens": 735913222.0, + "step": 19284 + }, + { + "epoch": 2.4532502226179878, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91875648498535, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8817014098167419, + "num_tokens": 735951884.0, + "step": 19285 + }, + { + "epoch": 2.453377432896578, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.120798110961914, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8694732189178467, + "num_tokens": 735998955.0, + "step": 19286 + }, + { + "epoch": 2.4535046431751684, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.233234405517578, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8725294470787048, + "num_tokens": 736039631.0, + "step": 19287 + }, + { + "epoch": 2.453631853453759, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.042537689208984, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8698588609695435, + "num_tokens": 736085100.0, + "step": 19288 + }, + { + "epoch": 2.4537590637323494, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08296775817871, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8882473707199097, + "num_tokens": 736119680.0, + "step": 19289 + }, + { + "epoch": 2.45388627401094, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9011287689209, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8854305744171143, + "num_tokens": 736159769.0, + "step": 19290 + }, + { + "epoch": 2.4540134842895305, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.966800689697266, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8801692724227905, + "num_tokens": 736199566.0, + "step": 19291 + }, + { + "epoch": 2.454140694568121, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.937170028686523, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8785828351974487, + "num_tokens": 736239238.0, + "step": 19292 + }, + { + "epoch": 2.4542679048467115, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.179338455200195, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8768845200538635, + "num_tokens": 736274576.0, + "step": 19293 + }, + { + "epoch": 2.454395115125302, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.752378463745117, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.868040919303894, + "num_tokens": 736307509.0, + "step": 19294 + }, + { + "epoch": 2.4545223254038926, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06804084777832, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8711496591567993, + "num_tokens": 736348313.0, + "step": 19295 + }, + { + "epoch": 2.454649535682483, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93854331970215, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.869662880897522, + "num_tokens": 736380716.0, + "step": 19296 + }, + { + "epoch": 2.4547767459610736, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.211254119873047, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8524206876754761, + "num_tokens": 736422669.0, + "step": 19297 + }, + { + "epoch": 2.454903956239664, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.117416381835938, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8820920586585999, + "num_tokens": 736452929.0, + "step": 19298 + }, + { + "epoch": 2.4550311665182547, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.996246337890625, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8749105334281921, + "num_tokens": 736486755.0, + "step": 19299 + }, + { + "epoch": 2.4551583767968452, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90140151977539, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8819839954376221, + "num_tokens": 736521542.0, + "step": 19300 + }, + { + "epoch": 2.4552855870754358, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.896730422973633, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8756338357925415, + "num_tokens": 736557312.0, + "step": 19301 + }, + { + "epoch": 2.4554127973540263, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05042266845703, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8706314563751221, + "num_tokens": 736596469.0, + "step": 19302 + }, + { + "epoch": 2.455540007632617, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.983774185180664, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8584535121917725, + "num_tokens": 736636078.0, + "step": 19303 + }, + { + "epoch": 2.4556672179112073, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.108366012573242, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8857948184013367, + "num_tokens": 736674117.0, + "step": 19304 + }, + { + "epoch": 2.455794428189798, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.770023345947266, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8712470531463623, + "num_tokens": 736709116.0, + "step": 19305 + }, + { + "epoch": 2.4559216384683884, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91170310974121, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8659457564353943, + "num_tokens": 736750383.0, + "step": 19306 + }, + { + "epoch": 2.456048848746979, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09856605529785, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.862660825252533, + "num_tokens": 736790836.0, + "step": 19307 + }, + { + "epoch": 2.4561760590255695, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15228271484375, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8722323179244995, + "num_tokens": 736826085.0, + "step": 19308 + }, + { + "epoch": 2.4563032693041595, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96050453186035, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8738561868667603, + "num_tokens": 736860599.0, + "step": 19309 + }, + { + "epoch": 2.4564304795827505, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1413631439209, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8911997079849243, + "num_tokens": 736893286.0, + "step": 19310 + }, + { + "epoch": 2.4565576898613406, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.002029418945312, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8833901286125183, + "num_tokens": 736931597.0, + "step": 19311 + }, + { + "epoch": 2.456684900139931, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.112699508666992, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.881916344165802, + "num_tokens": 736972183.0, + "step": 19312 + }, + { + "epoch": 2.4568121104185217, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17154312133789, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8805615901947021, + "num_tokens": 737004184.0, + "step": 19313 + }, + { + "epoch": 2.456939320697112, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.060802459716797, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8742137551307678, + "num_tokens": 737042644.0, + "step": 19314 + }, + { + "epoch": 2.4570665309757027, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.203792572021484, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8650084733963013, + "num_tokens": 737074830.0, + "step": 19315 + }, + { + "epoch": 2.4571937412542932, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11910057067871, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8808165788650513, + "num_tokens": 737107756.0, + "step": 19316 + }, + { + "epoch": 2.4573209515328838, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1622257232666, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8833416700363159, + "num_tokens": 737143457.0, + "step": 19317 + }, + { + "epoch": 2.4574481618114743, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05310821533203, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8696722388267517, + "num_tokens": 737177005.0, + "step": 19318 + }, + { + "epoch": 2.457575372090065, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15852928161621, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8654493689537048, + "num_tokens": 737218265.0, + "step": 19319 + }, + { + "epoch": 2.4577025823686554, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.192588806152344, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8643875122070312, + "num_tokens": 737259495.0, + "step": 19320 + }, + { + "epoch": 2.457829792647246, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.053857803344727, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8664473295211792, + "num_tokens": 737300073.0, + "step": 19321 + }, + { + "epoch": 2.4579570029258364, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.097082138061523, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.87537682056427, + "num_tokens": 737334716.0, + "step": 19322 + }, + { + "epoch": 2.458084213204427, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90391731262207, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8783477544784546, + "num_tokens": 737377086.0, + "step": 19323 + }, + { + "epoch": 2.4582114234830175, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03725814819336, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8634848594665527, + "num_tokens": 737418259.0, + "step": 19324 + }, + { + "epoch": 2.458338633761608, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.077747344970703, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8645204305648804, + "num_tokens": 737453150.0, + "step": 19325 + }, + { + "epoch": 2.4584658440401985, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9411563873291, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8893730044364929, + "num_tokens": 737495552.0, + "step": 19326 + }, + { + "epoch": 2.458593054318789, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.16339683532715, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8795942068099976, + "num_tokens": 737537518.0, + "step": 19327 + }, + { + "epoch": 2.4587202645973796, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.100337982177734, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8706586956977844, + "num_tokens": 737577472.0, + "step": 19328 + }, + { + "epoch": 2.45884747487597, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.170738220214844, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8646475076675415, + "num_tokens": 737617250.0, + "step": 19329 + }, + { + "epoch": 2.4589746851545606, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87712860107422, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8779178857803345, + "num_tokens": 737655544.0, + "step": 19330 + }, + { + "epoch": 2.459101895433151, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.068376541137695, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8768813610076904, + "num_tokens": 737688644.0, + "step": 19331 + }, + { + "epoch": 2.4592291057117417, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15125274658203, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8788728713989258, + "num_tokens": 737730264.0, + "step": 19332 + }, + { + "epoch": 2.459356315990332, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.071855545043945, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.868794322013855, + "num_tokens": 737763578.0, + "step": 19333 + }, + { + "epoch": 2.4594835262689223, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93234634399414, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.875438928604126, + "num_tokens": 737804062.0, + "step": 19334 + }, + { + "epoch": 2.4596107365475133, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87896728515625, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8879595994949341, + "num_tokens": 737845608.0, + "step": 19335 + }, + { + "epoch": 2.4597379468261034, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.132761001586914, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8561714291572571, + "num_tokens": 737881366.0, + "step": 19336 + }, + { + "epoch": 2.459865157104694, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.016191482543945, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8659993410110474, + "num_tokens": 737922595.0, + "step": 19337 + }, + { + "epoch": 2.4599923673832844, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.134746551513672, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8808399438858032, + "num_tokens": 737956368.0, + "step": 19338 + }, + { + "epoch": 2.460119577661875, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88637924194336, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8785644769668579, + "num_tokens": 738000167.0, + "step": 19339 + }, + { + "epoch": 2.4602467879404655, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08678436279297, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8665512204170227, + "num_tokens": 738040850.0, + "step": 19340 + }, + { + "epoch": 2.460373998219056, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.803083419799805, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8719942569732666, + "num_tokens": 738080370.0, + "step": 19341 + }, + { + "epoch": 2.4605012084976465, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.031620025634766, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8793306350708008, + "num_tokens": 738116751.0, + "step": 19342 + }, + { + "epoch": 2.460628418776237, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.957595825195312, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8919131755828857, + "num_tokens": 738154597.0, + "step": 19343 + }, + { + "epoch": 2.4607556290548276, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.128358840942383, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8814692497253418, + "num_tokens": 738186484.0, + "step": 19344 + }, + { + "epoch": 2.460882839333418, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0811824798584, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8777081370353699, + "num_tokens": 738217722.0, + "step": 19345 + }, + { + "epoch": 2.4610100496120086, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.908597946166992, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8870683908462524, + "num_tokens": 738255304.0, + "step": 19346 + }, + { + "epoch": 2.461137259890599, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25309181213379, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8697222471237183, + "num_tokens": 738292162.0, + "step": 19347 + }, + { + "epoch": 2.4612644701691897, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.931734085083008, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8866804242134094, + "num_tokens": 738332851.0, + "step": 19348 + }, + { + "epoch": 2.4613916804477802, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.157039642333984, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8839205503463745, + "num_tokens": 738375120.0, + "step": 19349 + }, + { + "epoch": 2.4615188907263708, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.013504028320312, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8654749989509583, + "num_tokens": 738413857.0, + "step": 19350 + }, + { + "epoch": 2.4616461010049613, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.006343841552734, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8677005767822266, + "num_tokens": 738450097.0, + "step": 19351 + }, + { + "epoch": 2.461773311283552, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.970529556274414, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8559325933456421, + "num_tokens": 738483784.0, + "step": 19352 + }, + { + "epoch": 2.4619005215621423, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.834259033203125, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8549925684928894, + "num_tokens": 738526820.0, + "step": 19353 + }, + { + "epoch": 2.462027731840733, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.830686569213867, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8601477146148682, + "num_tokens": 738569049.0, + "step": 19354 + }, + { + "epoch": 2.4621549421193234, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.865413665771484, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8706674575805664, + "num_tokens": 738608320.0, + "step": 19355 + }, + { + "epoch": 2.462282152397914, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.969951629638672, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8884380459785461, + "num_tokens": 738645566.0, + "step": 19356 + }, + { + "epoch": 2.4624093626765045, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.007417678833008, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8511083722114563, + "num_tokens": 738686184.0, + "step": 19357 + }, + { + "epoch": 2.462536572955095, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.90738868713379, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8603338003158569, + "num_tokens": 738728820.0, + "step": 19358 + }, + { + "epoch": 2.462663783233685, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13255500793457, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8450102806091309, + "num_tokens": 738760180.0, + "step": 19359 + }, + { + "epoch": 2.462790993512276, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.102210998535156, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8821389079093933, + "num_tokens": 738795501.0, + "step": 19360 + }, + { + "epoch": 2.462918203790866, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.935651779174805, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.864080011844635, + "num_tokens": 738831027.0, + "step": 19361 + }, + { + "epoch": 2.4630454140694567, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.193479537963867, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8562418222427368, + "num_tokens": 738870383.0, + "step": 19362 + }, + { + "epoch": 2.463172624348047, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.000389099121094, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8789055347442627, + "num_tokens": 738908231.0, + "step": 19363 + }, + { + "epoch": 2.4632998346266377, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.884445190429688, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8651745319366455, + "num_tokens": 738950927.0, + "step": 19364 + }, + { + "epoch": 2.4634270449052282, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02163314819336, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8851227760314941, + "num_tokens": 738989857.0, + "step": 19365 + }, + { + "epoch": 2.4635542551838188, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.146133422851562, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8870632648468018, + "num_tokens": 739021281.0, + "step": 19366 + }, + { + "epoch": 2.4636814654624093, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.849523544311523, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8801599144935608, + "num_tokens": 739063475.0, + "step": 19367 + }, + { + "epoch": 2.463808675741, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99041175842285, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.869485080242157, + "num_tokens": 739107083.0, + "step": 19368 + }, + { + "epoch": 2.4639358860195903, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.969661712646484, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8526705503463745, + "num_tokens": 739152002.0, + "step": 19369 + }, + { + "epoch": 2.464063096298181, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98556900024414, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8710118532180786, + "num_tokens": 739191781.0, + "step": 19370 + }, + { + "epoch": 2.4641903065767714, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.002552032470703, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8934237360954285, + "num_tokens": 739230374.0, + "step": 19371 + }, + { + "epoch": 2.464317516855362, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96904754638672, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8711722493171692, + "num_tokens": 739267310.0, + "step": 19372 + }, + { + "epoch": 2.4644447271339525, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04166603088379, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8768209218978882, + "num_tokens": 739305074.0, + "step": 19373 + }, + { + "epoch": 2.464571937412543, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.043649673461914, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8655654788017273, + "num_tokens": 739343028.0, + "step": 19374 + }, + { + "epoch": 2.4646991476911335, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96788787841797, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8721991777420044, + "num_tokens": 739384364.0, + "step": 19375 + }, + { + "epoch": 2.464826357969724, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06553077697754, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8892821073532104, + "num_tokens": 739423564.0, + "step": 19376 + }, + { + "epoch": 2.4649535682483146, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.757835388183594, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8823323249816895, + "num_tokens": 739457609.0, + "step": 19377 + }, + { + "epoch": 2.465080778526905, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.978992462158203, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8740874528884888, + "num_tokens": 739497204.0, + "step": 19378 + }, + { + "epoch": 2.4652079888054956, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.012351989746094, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8678464293479919, + "num_tokens": 739543688.0, + "step": 19379 + }, + { + "epoch": 2.465335199084086, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99797821044922, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8850656747817993, + "num_tokens": 739578221.0, + "step": 19380 + }, + { + "epoch": 2.4654624093626767, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.855348587036133, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8790707588195801, + "num_tokens": 739615892.0, + "step": 19381 + }, + { + "epoch": 2.4655896196412668, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.193859100341797, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8593950271606445, + "num_tokens": 739657697.0, + "step": 19382 + }, + { + "epoch": 2.4657168299198577, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.874250411987305, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8821743726730347, + "num_tokens": 739694173.0, + "step": 19383 + }, + { + "epoch": 2.465844040198448, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.065746307373047, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8715478777885437, + "num_tokens": 739734111.0, + "step": 19384 + }, + { + "epoch": 2.4659712504770384, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.852224349975586, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.875237226486206, + "num_tokens": 739771409.0, + "step": 19385 + }, + { + "epoch": 2.466098460755629, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.281774520874023, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8606853485107422, + "num_tokens": 739807185.0, + "step": 19386 + }, + { + "epoch": 2.4662256710342194, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.070680618286133, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8858506679534912, + "num_tokens": 739837750.0, + "step": 19387 + }, + { + "epoch": 2.46635288131281, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.757417678833008, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8725844621658325, + "num_tokens": 739879516.0, + "step": 19388 + }, + { + "epoch": 2.4664800915914005, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.080669403076172, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8797813653945923, + "num_tokens": 739918688.0, + "step": 19389 + }, + { + "epoch": 2.466607301869991, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.869752883911133, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8775583505630493, + "num_tokens": 739958978.0, + "step": 19390 + }, + { + "epoch": 2.4667345121485815, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.113990783691406, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8787342309951782, + "num_tokens": 740002428.0, + "step": 19391 + }, + { + "epoch": 2.466861722427172, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.015043258666992, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8942111730575562, + "num_tokens": 740045856.0, + "step": 19392 + }, + { + "epoch": 2.4669889327057626, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.89320182800293, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8712950944900513, + "num_tokens": 740084512.0, + "step": 19393 + }, + { + "epoch": 2.467116142984353, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99629020690918, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8848074078559875, + "num_tokens": 740129624.0, + "step": 19394 + }, + { + "epoch": 2.4672433532629436, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9901123046875, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8578168153762817, + "num_tokens": 740168368.0, + "step": 19395 + }, + { + "epoch": 2.467370563541534, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.029325485229492, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8676645159721375, + "num_tokens": 740206670.0, + "step": 19396 + }, + { + "epoch": 2.4674977738201247, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.953855514526367, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8527324795722961, + "num_tokens": 740246422.0, + "step": 19397 + }, + { + "epoch": 2.4676249840987152, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.976232528686523, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8707395195960999, + "num_tokens": 740284450.0, + "step": 19398 + }, + { + "epoch": 2.4677521943773058, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97950553894043, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8519545793533325, + "num_tokens": 740319062.0, + "step": 19399 + }, + { + "epoch": 2.4678794046558963, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.029027938842773, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8744902610778809, + "num_tokens": 740353996.0, + "step": 19400 + }, + { + "epoch": 2.468006614934487, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.023225784301758, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8727576732635498, + "num_tokens": 740394694.0, + "step": 19401 + }, + { + "epoch": 2.4681338252130773, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.974973678588867, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8938344717025757, + "num_tokens": 740436567.0, + "step": 19402 + }, + { + "epoch": 2.468261035491668, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.916488647460938, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8662208318710327, + "num_tokens": 740476458.0, + "step": 19403 + }, + { + "epoch": 2.4683882457702584, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.959545135498047, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8822005987167358, + "num_tokens": 740514703.0, + "step": 19404 + }, + { + "epoch": 2.468515456048849, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.193124771118164, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8844826221466064, + "num_tokens": 740552328.0, + "step": 19405 + }, + { + "epoch": 2.4686426663274394, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11723518371582, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8555429577827454, + "num_tokens": 740593141.0, + "step": 19406 + }, + { + "epoch": 2.4687698766060295, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.931161880493164, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8836901783943176, + "num_tokens": 740634097.0, + "step": 19407 + }, + { + "epoch": 2.4688970868846205, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.054840087890625, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8799415826797485, + "num_tokens": 740672917.0, + "step": 19408 + }, + { + "epoch": 2.4690242971632106, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.148117065429688, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8689708113670349, + "num_tokens": 740709351.0, + "step": 19409 + }, + { + "epoch": 2.469151507441801, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.086090087890625, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.864457368850708, + "num_tokens": 740744049.0, + "step": 19410 + }, + { + "epoch": 2.4692787177203916, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.066015243530273, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.877059817314148, + "num_tokens": 740784765.0, + "step": 19411 + }, + { + "epoch": 2.469405927998982, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.228973388671875, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8664908409118652, + "num_tokens": 740815859.0, + "step": 19412 + }, + { + "epoch": 2.4695331382775727, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.795364379882812, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8913216590881348, + "num_tokens": 740854300.0, + "step": 19413 + }, + { + "epoch": 2.4696603485561632, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.978944778442383, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8582606315612793, + "num_tokens": 740891836.0, + "step": 19414 + }, + { + "epoch": 2.4697875588347538, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.160953521728516, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8818261027336121, + "num_tokens": 740927434.0, + "step": 19415 + }, + { + "epoch": 2.4699147691133443, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.302112579345703, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8765795230865479, + "num_tokens": 740973707.0, + "step": 19416 + }, + { + "epoch": 2.470041979391935, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.235912322998047, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8806518316268921, + "num_tokens": 741015953.0, + "step": 19417 + }, + { + "epoch": 2.4701691896705253, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.010061264038086, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8803555965423584, + "num_tokens": 741049634.0, + "step": 19418 + }, + { + "epoch": 2.470296399949116, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.037830352783203, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8837062120437622, + "num_tokens": 741084267.0, + "step": 19419 + }, + { + "epoch": 2.4704236102277064, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.158920288085938, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.88039231300354, + "num_tokens": 741122237.0, + "step": 19420 + }, + { + "epoch": 2.470550820506297, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.163673400878906, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8615929484367371, + "num_tokens": 741163593.0, + "step": 19421 + }, + { + "epoch": 2.4706780307848875, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.102981567382812, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8644311428070068, + "num_tokens": 741202328.0, + "step": 19422 + }, + { + "epoch": 2.470805241063478, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.069658279418945, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8931036591529846, + "num_tokens": 741240095.0, + "step": 19423 + }, + { + "epoch": 2.4709324513420685, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2283935546875, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8758342862129211, + "num_tokens": 741278509.0, + "step": 19424 + }, + { + "epoch": 2.471059661620659, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.236467361450195, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.870336651802063, + "num_tokens": 741321446.0, + "step": 19425 + }, + { + "epoch": 2.4711868718992496, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.984455108642578, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8695774674415588, + "num_tokens": 741360469.0, + "step": 19426 + }, + { + "epoch": 2.47131408217784, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.124908447265625, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8646572828292847, + "num_tokens": 741393833.0, + "step": 19427 + }, + { + "epoch": 2.4714412924564306, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.989423751831055, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8796060085296631, + "num_tokens": 741426956.0, + "step": 19428 + }, + { + "epoch": 2.471568502735021, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97235107421875, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8740859031677246, + "num_tokens": 741464820.0, + "step": 19429 + }, + { + "epoch": 2.4716957130136117, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.049070358276367, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8804256916046143, + "num_tokens": 741499519.0, + "step": 19430 + }, + { + "epoch": 2.471822923292202, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25855255126953, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8808510303497314, + "num_tokens": 741533443.0, + "step": 19431 + }, + { + "epoch": 2.4719501335707923, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.907920837402344, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8700868487358093, + "num_tokens": 741576849.0, + "step": 19432 + }, + { + "epoch": 2.4720773438493833, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.222034454345703, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8786182403564453, + "num_tokens": 741614653.0, + "step": 19433 + }, + { + "epoch": 2.4722045541279734, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.216861724853516, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.868212103843689, + "num_tokens": 741649354.0, + "step": 19434 + }, + { + "epoch": 2.472331764406564, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.133638381958008, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8661331534385681, + "num_tokens": 741692257.0, + "step": 19435 + }, + { + "epoch": 2.4724589746851544, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.916828155517578, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8713058829307556, + "num_tokens": 741729739.0, + "step": 19436 + }, + { + "epoch": 2.472586184963745, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.393577575683594, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8645987510681152, + "num_tokens": 741763409.0, + "step": 19437 + }, + { + "epoch": 2.4727133952423355, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.965011596679688, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8709501028060913, + "num_tokens": 741798746.0, + "step": 19438 + }, + { + "epoch": 2.472840605520926, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.955448150634766, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8824522495269775, + "num_tokens": 741840488.0, + "step": 19439 + }, + { + "epoch": 2.4729678157995165, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.274707794189453, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8902812600135803, + "num_tokens": 741880839.0, + "step": 19440 + }, + { + "epoch": 2.473095026078107, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.958698272705078, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8675603270530701, + "num_tokens": 741919317.0, + "step": 19441 + }, + { + "epoch": 2.4732222363566976, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.270671844482422, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8541650772094727, + "num_tokens": 741955782.0, + "step": 19442 + }, + { + "epoch": 2.473349446635288, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.194316864013672, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8682951927185059, + "num_tokens": 741990410.0, + "step": 19443 + }, + { + "epoch": 2.4734766569138786, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.017948150634766, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.870120644569397, + "num_tokens": 742031536.0, + "step": 19444 + }, + { + "epoch": 2.473603867192469, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.166271209716797, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8632729053497314, + "num_tokens": 742067222.0, + "step": 19445 + }, + { + "epoch": 2.4737310774710597, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95186996459961, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8846990466117859, + "num_tokens": 742114740.0, + "step": 19446 + }, + { + "epoch": 2.47385828774965, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.96813201904297, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8809205293655396, + "num_tokens": 742149344.0, + "step": 19447 + }, + { + "epoch": 2.4739854980282407, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.422632217407227, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8746685981750488, + "num_tokens": 742191089.0, + "step": 19448 + }, + { + "epoch": 2.4741127083068313, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.971359252929688, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8731344938278198, + "num_tokens": 742224944.0, + "step": 19449 + }, + { + "epoch": 2.474239918585422, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99821662902832, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8632509708404541, + "num_tokens": 742263674.0, + "step": 19450 + }, + { + "epoch": 2.4743671288640123, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.106704711914062, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8687036633491516, + "num_tokens": 742300424.0, + "step": 19451 + }, + { + "epoch": 2.474494339142603, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.094451904296875, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8841061592102051, + "num_tokens": 742333854.0, + "step": 19452 + }, + { + "epoch": 2.4746215494211934, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.025386810302734, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8825266361236572, + "num_tokens": 742377126.0, + "step": 19453 + }, + { + "epoch": 2.474748759699784, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.333829879760742, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8712344169616699, + "num_tokens": 742421284.0, + "step": 19454 + }, + { + "epoch": 2.4748759699783744, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.045063018798828, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8572163581848145, + "num_tokens": 742459124.0, + "step": 19455 + }, + { + "epoch": 2.475003180256965, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94956398010254, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8643679022789001, + "num_tokens": 742493137.0, + "step": 19456 + }, + { + "epoch": 2.475130390535555, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09404754638672, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8772213459014893, + "num_tokens": 742529805.0, + "step": 19457 + }, + { + "epoch": 2.475257600814146, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.217121124267578, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8865835666656494, + "num_tokens": 742560859.0, + "step": 19458 + }, + { + "epoch": 2.475384811092736, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.105173110961914, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8854234218597412, + "num_tokens": 742601465.0, + "step": 19459 + }, + { + "epoch": 2.4755120213713266, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21148109436035, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8670628070831299, + "num_tokens": 742639242.0, + "step": 19460 + }, + { + "epoch": 2.475639231649917, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.384429931640625, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8735994696617126, + "num_tokens": 742676574.0, + "step": 19461 + }, + { + "epoch": 2.4757664419285077, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.095226287841797, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8636307716369629, + "num_tokens": 742713326.0, + "step": 19462 + }, + { + "epoch": 2.4758936522070982, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30242347717285, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.850829005241394, + "num_tokens": 742756883.0, + "step": 19463 + }, + { + "epoch": 2.4760208624856888, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23259925842285, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8694885969161987, + "num_tokens": 742794301.0, + "step": 19464 + }, + { + "epoch": 2.4761480727642793, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.205698013305664, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.876064121723175, + "num_tokens": 742832966.0, + "step": 19465 + }, + { + "epoch": 2.47627528304287, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.359050750732422, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8696417808532715, + "num_tokens": 742872926.0, + "step": 19466 + }, + { + "epoch": 2.4764024933214603, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.478778839111328, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8803489208221436, + "num_tokens": 742910838.0, + "step": 19467 + }, + { + "epoch": 2.476529703600051, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.093957901000977, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8818861842155457, + "num_tokens": 742950098.0, + "step": 19468 + }, + { + "epoch": 2.4766569138786414, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.940044403076172, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8838738203048706, + "num_tokens": 742983872.0, + "step": 19469 + }, + { + "epoch": 2.476784124157232, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.313270568847656, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8632687926292419, + "num_tokens": 743029816.0, + "step": 19470 + }, + { + "epoch": 2.4769113344358225, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.090696334838867, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8658098578453064, + "num_tokens": 743068565.0, + "step": 19471 + }, + { + "epoch": 2.477038544714413, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.16533660888672, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8745737075805664, + "num_tokens": 743112137.0, + "step": 19472 + }, + { + "epoch": 2.4771657549930035, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.349201202392578, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8886660933494568, + "num_tokens": 743145067.0, + "step": 19473 + }, + { + "epoch": 2.477292965271594, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.942169189453125, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8791751861572266, + "num_tokens": 743177017.0, + "step": 19474 + }, + { + "epoch": 2.4774201755501846, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.050640106201172, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8736238479614258, + "num_tokens": 743218121.0, + "step": 19475 + }, + { + "epoch": 2.477547385828775, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.203205108642578, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8802955746650696, + "num_tokens": 743256635.0, + "step": 19476 + }, + { + "epoch": 2.4776745961073656, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15468978881836, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.871306300163269, + "num_tokens": 743293801.0, + "step": 19477 + }, + { + "epoch": 2.477801806385956, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.975025177001953, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8870412111282349, + "num_tokens": 743331582.0, + "step": 19478 + }, + { + "epoch": 2.4779290166645467, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.290742874145508, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8766342401504517, + "num_tokens": 743369901.0, + "step": 19479 + }, + { + "epoch": 2.4780562269431368, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11709976196289, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8695844411849976, + "num_tokens": 743405808.0, + "step": 19480 + }, + { + "epoch": 2.4781834372217277, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.220979690551758, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8755795955657959, + "num_tokens": 743447236.0, + "step": 19481 + }, + { + "epoch": 2.478310647500318, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.331716537475586, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8812533617019653, + "num_tokens": 743486494.0, + "step": 19482 + }, + { + "epoch": 2.4784378577789083, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94024085998535, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8702623844146729, + "num_tokens": 743523941.0, + "step": 19483 + }, + { + "epoch": 2.478565068057499, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.924671173095703, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8779990673065186, + "num_tokens": 743554169.0, + "step": 19484 + }, + { + "epoch": 2.4786922783360894, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.065746307373047, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8592546582221985, + "num_tokens": 743594656.0, + "step": 19485 + }, + { + "epoch": 2.47881948861468, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09193229675293, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8803067207336426, + "num_tokens": 743628638.0, + "step": 19486 + }, + { + "epoch": 2.4789466988932705, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.002893447875977, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8666077256202698, + "num_tokens": 743665382.0, + "step": 19487 + }, + { + "epoch": 2.479073909171861, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.101903915405273, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8864294290542603, + "num_tokens": 743701712.0, + "step": 19488 + }, + { + "epoch": 2.4792011194504515, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.972993850708008, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8735278844833374, + "num_tokens": 743740347.0, + "step": 19489 + }, + { + "epoch": 2.479328329729042, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.185333251953125, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8765257000923157, + "num_tokens": 743775434.0, + "step": 19490 + }, + { + "epoch": 2.4794555400076326, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.228832244873047, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8729144334793091, + "num_tokens": 743812475.0, + "step": 19491 + }, + { + "epoch": 2.479582750286223, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.993247985839844, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8643479347229004, + "num_tokens": 743851350.0, + "step": 19492 + }, + { + "epoch": 2.4797099605648136, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.902767181396484, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8706381320953369, + "num_tokens": 743885086.0, + "step": 19493 + }, + { + "epoch": 2.479837170843404, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.226194381713867, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8756985664367676, + "num_tokens": 743931747.0, + "step": 19494 + }, + { + "epoch": 2.4799643811219947, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.248388290405273, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8862703442573547, + "num_tokens": 743971806.0, + "step": 19495 + }, + { + "epoch": 2.480091591400585, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.233150482177734, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.883243203163147, + "num_tokens": 744005304.0, + "step": 19496 + }, + { + "epoch": 2.4802188016791757, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25257110595703, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.865689754486084, + "num_tokens": 744051159.0, + "step": 19497 + }, + { + "epoch": 2.4803460119577663, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.236570358276367, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8559347987174988, + "num_tokens": 744089856.0, + "step": 19498 + }, + { + "epoch": 2.480473222236357, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09591293334961, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8544496893882751, + "num_tokens": 744136355.0, + "step": 19499 + }, + { + "epoch": 2.4806004325149473, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23228645324707, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8625420928001404, + "num_tokens": 744177682.0, + "step": 19500 + }, + { + "epoch": 2.480727642793538, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05763053894043, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.874968409538269, + "num_tokens": 744214209.0, + "step": 19501 + }, + { + "epoch": 2.4808548530721284, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.942276000976562, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8565101623535156, + "num_tokens": 744257923.0, + "step": 19502 + }, + { + "epoch": 2.480982063350719, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09288787841797, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.880425214767456, + "num_tokens": 744293370.0, + "step": 19503 + }, + { + "epoch": 2.4811092736293094, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04566764831543, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8792628049850464, + "num_tokens": 744328572.0, + "step": 19504 + }, + { + "epoch": 2.4812364839078995, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.316022872924805, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8578017950057983, + "num_tokens": 744364124.0, + "step": 19505 + }, + { + "epoch": 2.4813636941864905, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2568302154541, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8770277500152588, + "num_tokens": 744403679.0, + "step": 19506 + }, + { + "epoch": 2.4814909044650806, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.82503890991211, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8732336163520813, + "num_tokens": 744440922.0, + "step": 19507 + }, + { + "epoch": 2.481618114743671, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.181007385253906, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8707519769668579, + "num_tokens": 744479025.0, + "step": 19508 + }, + { + "epoch": 2.4817453250222616, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.131174087524414, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8751716613769531, + "num_tokens": 744514165.0, + "step": 19509 + }, + { + "epoch": 2.481872535300852, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.223785400390625, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.869015097618103, + "num_tokens": 744547210.0, + "step": 19510 + }, + { + "epoch": 2.4819997455794427, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.279380798339844, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8732950687408447, + "num_tokens": 744585132.0, + "step": 19511 + }, + { + "epoch": 2.4821269558580332, + "ewc_loss": 0.03759765625, + "ewc_loss_parallel": 3.7670135498046875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9904842376709, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8840398788452148, + "num_tokens": 744617921.0, + "step": 19512 + }, + { + "epoch": 2.4822541661366238, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18444061279297, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8677071332931519, + "num_tokens": 744653095.0, + "step": 19513 + }, + { + "epoch": 2.4823813764152143, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95942497253418, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8803491592407227, + "num_tokens": 744688258.0, + "step": 19514 + }, + { + "epoch": 2.482508586693805, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.184865951538086, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8734787702560425, + "num_tokens": 744723500.0, + "step": 19515 + }, + { + "epoch": 2.4826357969723953, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93090057373047, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8738990426063538, + "num_tokens": 744757133.0, + "step": 19516 + }, + { + "epoch": 2.482763007250986, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.146053314208984, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8871258497238159, + "num_tokens": 744793814.0, + "step": 19517 + }, + { + "epoch": 2.4828902175295764, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03557586669922, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8765512704849243, + "num_tokens": 744828680.0, + "step": 19518 + }, + { + "epoch": 2.483017427808167, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29340171813965, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8688116669654846, + "num_tokens": 744861642.0, + "step": 19519 + }, + { + "epoch": 2.4831446380867574, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.105409622192383, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8846065998077393, + "num_tokens": 744900345.0, + "step": 19520 + }, + { + "epoch": 2.483271848365348, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10245132446289, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.879482626914978, + "num_tokens": 744938589.0, + "step": 19521 + }, + { + "epoch": 2.4833990586439385, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23860740661621, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.864391028881073, + "num_tokens": 744975385.0, + "step": 19522 + }, + { + "epoch": 2.483526268922529, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.971221923828125, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8631771802902222, + "num_tokens": 745010860.0, + "step": 19523 + }, + { + "epoch": 2.4836534792011196, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15626335144043, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8655684590339661, + "num_tokens": 745047183.0, + "step": 19524 + }, + { + "epoch": 2.48378068947971, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.172691345214844, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8704876899719238, + "num_tokens": 745081906.0, + "step": 19525 + }, + { + "epoch": 2.4839078997583006, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.165040969848633, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8782644271850586, + "num_tokens": 745127285.0, + "step": 19526 + }, + { + "epoch": 2.484035110036891, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.163475036621094, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.889083981513977, + "num_tokens": 745171564.0, + "step": 19527 + }, + { + "epoch": 2.4841623203154817, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15729331970215, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8834328651428223, + "num_tokens": 745214252.0, + "step": 19528 + }, + { + "epoch": 2.484289530594072, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51174545288086, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.875942587852478, + "num_tokens": 745251983.0, + "step": 19529 + }, + { + "epoch": 2.4844167408726623, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.004640579223633, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8763140439987183, + "num_tokens": 745287037.0, + "step": 19530 + }, + { + "epoch": 2.4845439511512533, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.117990493774414, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.871123194694519, + "num_tokens": 745327854.0, + "step": 19531 + }, + { + "epoch": 2.4846711614298433, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.345516204833984, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8625118732452393, + "num_tokens": 745370227.0, + "step": 19532 + }, + { + "epoch": 2.484798371708434, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02401351928711, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8719096183776855, + "num_tokens": 745409416.0, + "step": 19533 + }, + { + "epoch": 2.4849255819870244, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.051790237426758, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8569670915603638, + "num_tokens": 745449908.0, + "step": 19534 + }, + { + "epoch": 2.485052792265615, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25911521911621, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8624155521392822, + "num_tokens": 745486384.0, + "step": 19535 + }, + { + "epoch": 2.4851800025442055, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.269739151000977, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8629858493804932, + "num_tokens": 745523058.0, + "step": 19536 + }, + { + "epoch": 2.485307212822796, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.837753295898438, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8566679358482361, + "num_tokens": 745559276.0, + "step": 19537 + }, + { + "epoch": 2.4854344231013865, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23075294494629, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8722158670425415, + "num_tokens": 745592726.0, + "step": 19538 + }, + { + "epoch": 2.485561633379977, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.073158264160156, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8751578330993652, + "num_tokens": 745630117.0, + "step": 19539 + }, + { + "epoch": 2.4856888436585676, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98069190979004, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8598183393478394, + "num_tokens": 745664968.0, + "step": 19540 + }, + { + "epoch": 2.485816053937158, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.380382537841797, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8748760223388672, + "num_tokens": 745701893.0, + "step": 19541 + }, + { + "epoch": 2.4859432642157486, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.082691192626953, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8706239461898804, + "num_tokens": 745736884.0, + "step": 19542 + }, + { + "epoch": 2.486070474494339, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11096954345703, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8858280777931213, + "num_tokens": 745782381.0, + "step": 19543 + }, + { + "epoch": 2.4861976847729297, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.026565551757812, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8887490034103394, + "num_tokens": 745827630.0, + "step": 19544 + }, + { + "epoch": 2.48632489505152, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.158838272094727, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.880648136138916, + "num_tokens": 745862223.0, + "step": 19545 + }, + { + "epoch": 2.4864521053301107, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04021644592285, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8889753818511963, + "num_tokens": 745903077.0, + "step": 19546 + }, + { + "epoch": 2.4865793156087013, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.838783264160156, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8733541965484619, + "num_tokens": 745945534.0, + "step": 19547 + }, + { + "epoch": 2.486706525887292, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.256059646606445, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8576551675796509, + "num_tokens": 745986308.0, + "step": 19548 + }, + { + "epoch": 2.4868337361658823, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.096500396728516, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8643311262130737, + "num_tokens": 746022206.0, + "step": 19549 + }, + { + "epoch": 2.486960946444473, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.124088287353516, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8808996081352234, + "num_tokens": 746059854.0, + "step": 19550 + }, + { + "epoch": 2.4870881567230634, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02419090270996, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8804801106452942, + "num_tokens": 746103994.0, + "step": 19551 + }, + { + "epoch": 2.487215367001654, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42490577697754, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8622031807899475, + "num_tokens": 746138491.0, + "step": 19552 + }, + { + "epoch": 2.4873425772802444, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88595962524414, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8711968064308167, + "num_tokens": 746172598.0, + "step": 19553 + }, + { + "epoch": 2.487469787558835, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.150251388549805, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8810446262359619, + "num_tokens": 746210385.0, + "step": 19554 + }, + { + "epoch": 2.487596997837425, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32415771484375, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8762884736061096, + "num_tokens": 746252563.0, + "step": 19555 + }, + { + "epoch": 2.487724208116016, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.896739959716797, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8793068528175354, + "num_tokens": 746292356.0, + "step": 19556 + }, + { + "epoch": 2.487851418394606, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.079626083374023, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8621106743812561, + "num_tokens": 746325467.0, + "step": 19557 + }, + { + "epoch": 2.4879786286731966, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05323600769043, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8689494132995605, + "num_tokens": 746362285.0, + "step": 19558 + }, + { + "epoch": 2.488105838951787, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.040620803833008, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8806572556495667, + "num_tokens": 746401540.0, + "step": 19559 + }, + { + "epoch": 2.4882330492303777, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.982440948486328, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8748838901519775, + "num_tokens": 746439282.0, + "step": 19560 + }, + { + "epoch": 2.488360259508968, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.356042861938477, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8843114972114563, + "num_tokens": 746480993.0, + "step": 19561 + }, + { + "epoch": 2.4884874697875587, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.891300201416016, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8704588413238525, + "num_tokens": 746519318.0, + "step": 19562 + }, + { + "epoch": 2.4886146800661493, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.87419319152832, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8605401515960693, + "num_tokens": 746559761.0, + "step": 19563 + }, + { + "epoch": 2.48874189034474, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.063961029052734, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8817906379699707, + "num_tokens": 746591622.0, + "step": 19564 + }, + { + "epoch": 2.4888691006233303, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1424503326416, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.871374249458313, + "num_tokens": 746627709.0, + "step": 19565 + }, + { + "epoch": 2.488996310901921, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.977449417114258, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8698092699050903, + "num_tokens": 746663445.0, + "step": 19566 + }, + { + "epoch": 2.4891235211805114, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.021623611450195, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8612877130508423, + "num_tokens": 746706869.0, + "step": 19567 + }, + { + "epoch": 2.489250731459102, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04476547241211, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8854323625564575, + "num_tokens": 746749502.0, + "step": 19568 + }, + { + "epoch": 2.4893779417376924, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91883659362793, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8762491345405579, + "num_tokens": 746786510.0, + "step": 19569 + }, + { + "epoch": 2.489505152016283, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.127304077148438, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8785338997840881, + "num_tokens": 746819430.0, + "step": 19570 + }, + { + "epoch": 2.4896323622948735, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.022899627685547, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8773037195205688, + "num_tokens": 746856018.0, + "step": 19571 + }, + { + "epoch": 2.489759572573464, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97329330444336, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8673551082611084, + "num_tokens": 746897350.0, + "step": 19572 + }, + { + "epoch": 2.4898867828520546, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.052011489868164, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8676496744155884, + "num_tokens": 746937163.0, + "step": 19573 + }, + { + "epoch": 2.490013993130645, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.073715209960938, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8833566904067993, + "num_tokens": 746976140.0, + "step": 19574 + }, + { + "epoch": 2.4901412034092356, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.91336441040039, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8740789294242859, + "num_tokens": 747012576.0, + "step": 19575 + }, + { + "epoch": 2.490268413687826, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05378532409668, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8715862035751343, + "num_tokens": 747048179.0, + "step": 19576 + }, + { + "epoch": 2.4903956239664167, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.170969009399414, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8553075790405273, + "num_tokens": 747085291.0, + "step": 19577 + }, + { + "epoch": 2.4905228342450068, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97299575805664, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.880279541015625, + "num_tokens": 747119085.0, + "step": 19578 + }, + { + "epoch": 2.4906500445235977, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11432647705078, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8716709613800049, + "num_tokens": 747153463.0, + "step": 19579 + }, + { + "epoch": 2.490777254802188, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.012104034423828, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8484055995941162, + "num_tokens": 747194781.0, + "step": 19580 + }, + { + "epoch": 2.4909044650807783, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09183120727539, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.87474524974823, + "num_tokens": 747235328.0, + "step": 19581 + }, + { + "epoch": 2.491031675359369, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.234617233276367, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8714233040809631, + "num_tokens": 747278481.0, + "step": 19582 + }, + { + "epoch": 2.4911588856379594, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.005117416381836, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8878628611564636, + "num_tokens": 747313077.0, + "step": 19583 + }, + { + "epoch": 2.49128609591655, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.085481643676758, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8803523182868958, + "num_tokens": 747352693.0, + "step": 19584 + }, + { + "epoch": 2.4914133061951405, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13762855529785, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8656258583068848, + "num_tokens": 747397717.0, + "step": 19585 + }, + { + "epoch": 2.491540516473731, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.974294662475586, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8768466711044312, + "num_tokens": 747435457.0, + "step": 19586 + }, + { + "epoch": 2.4916677267523215, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06843376159668, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8772847056388855, + "num_tokens": 747479017.0, + "step": 19587 + }, + { + "epoch": 2.491794937030912, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.068546295166016, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8734207153320312, + "num_tokens": 747520247.0, + "step": 19588 + }, + { + "epoch": 2.4919221473095026, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.003284454345703, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8599655628204346, + "num_tokens": 747559983.0, + "step": 19589 + }, + { + "epoch": 2.492049357588093, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.401628494262695, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8685534596443176, + "num_tokens": 747595746.0, + "step": 19590 + }, + { + "epoch": 2.4921765678666836, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05742645263672, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8709093928337097, + "num_tokens": 747639261.0, + "step": 19591 + }, + { + "epoch": 2.492303778145274, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.063692092895508, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8539276123046875, + "num_tokens": 747680665.0, + "step": 19592 + }, + { + "epoch": 2.4924309884238647, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.117183685302734, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8773638606071472, + "num_tokens": 747713320.0, + "step": 19593 + }, + { + "epoch": 2.492558198702455, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.971986770629883, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8464624881744385, + "num_tokens": 747752167.0, + "step": 19594 + }, + { + "epoch": 2.4926854089810457, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.056703567504883, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.879483163356781, + "num_tokens": 747787274.0, + "step": 19595 + }, + { + "epoch": 2.4928126192596363, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.49659538269043, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8686687350273132, + "num_tokens": 747826647.0, + "step": 19596 + }, + { + "epoch": 2.492939829538227, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.050983428955078, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8690703511238098, + "num_tokens": 747866797.0, + "step": 19597 + }, + { + "epoch": 2.4930670398168173, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.848791122436523, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8758916854858398, + "num_tokens": 747907589.0, + "step": 19598 + }, + { + "epoch": 2.493194250095408, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.151391983032227, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8648896217346191, + "num_tokens": 747939865.0, + "step": 19599 + }, + { + "epoch": 2.4933214603739984, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.962074279785156, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8695408701896667, + "num_tokens": 747976238.0, + "step": 19600 + }, + { + "epoch": 2.493448670652589, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12259292602539, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8705078363418579, + "num_tokens": 748012076.0, + "step": 19601 + }, + { + "epoch": 2.4935758809311794, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.067543029785156, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8664146065711975, + "num_tokens": 748042518.0, + "step": 19602 + }, + { + "epoch": 2.4937030912097695, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.064050674438477, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8806601762771606, + "num_tokens": 748077509.0, + "step": 19603 + }, + { + "epoch": 2.4938303014883605, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.291370391845703, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8782023787498474, + "num_tokens": 748118164.0, + "step": 19604 + }, + { + "epoch": 2.4939575117669506, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2100772857666, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8663721084594727, + "num_tokens": 748153609.0, + "step": 19605 + }, + { + "epoch": 2.494084722045541, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.103168487548828, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8703650236129761, + "num_tokens": 748189573.0, + "step": 19606 + }, + { + "epoch": 2.4942119323241316, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28595733642578, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.868098258972168, + "num_tokens": 748231121.0, + "step": 19607 + }, + { + "epoch": 2.494339142602722, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.00868034362793, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8843469619750977, + "num_tokens": 748268744.0, + "step": 19608 + }, + { + "epoch": 2.4944663528813127, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.146642684936523, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8767775297164917, + "num_tokens": 748307441.0, + "step": 19609 + }, + { + "epoch": 2.494593563159903, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.920936584472656, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8726081252098083, + "num_tokens": 748343506.0, + "step": 19610 + }, + { + "epoch": 2.4947207734384937, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04408836364746, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8735390305519104, + "num_tokens": 748382209.0, + "step": 19611 + }, + { + "epoch": 2.4948479837170843, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05106544494629, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8924756050109863, + "num_tokens": 748413660.0, + "step": 19612 + }, + { + "epoch": 2.494975193995675, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.020906448364258, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8740202188491821, + "num_tokens": 748457037.0, + "step": 19613 + }, + { + "epoch": 2.4951024042742653, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06713104248047, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8518727421760559, + "num_tokens": 748497227.0, + "step": 19614 + }, + { + "epoch": 2.495229614552856, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2075252532959, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8698878884315491, + "num_tokens": 748529901.0, + "step": 19615 + }, + { + "epoch": 2.4953568248314464, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.829023361206055, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8599649667739868, + "num_tokens": 748568238.0, + "step": 19616 + }, + { + "epoch": 2.495484035110037, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.991453170776367, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.862494945526123, + "num_tokens": 748608153.0, + "step": 19617 + }, + { + "epoch": 2.4956112453886274, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23900032043457, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8687366247177124, + "num_tokens": 748646977.0, + "step": 19618 + }, + { + "epoch": 2.495738455667218, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.475589752197266, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8633471131324768, + "num_tokens": 748689586.0, + "step": 19619 + }, + { + "epoch": 2.4958656659458085, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.175386428833008, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.872200071811676, + "num_tokens": 748730915.0, + "step": 19620 + }, + { + "epoch": 2.495992876224399, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.229536056518555, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8686445951461792, + "num_tokens": 748766996.0, + "step": 19621 + }, + { + "epoch": 2.4961200865029896, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25218391418457, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8656531572341919, + "num_tokens": 748801225.0, + "step": 19622 + }, + { + "epoch": 2.49624729678158, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.055686950683594, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8679092526435852, + "num_tokens": 748841355.0, + "step": 19623 + }, + { + "epoch": 2.4963745070601706, + "ewc_loss": 0.037841796875, + "ewc_loss_parallel": 3.790855407714844e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30103302001953, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8707665205001831, + "num_tokens": 748879636.0, + "step": 19624 + }, + { + "epoch": 2.496501717338761, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.073545455932617, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8694873452186584, + "num_tokens": 748917795.0, + "step": 19625 + }, + { + "epoch": 2.4966289276173517, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.024513244628906, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8461400270462036, + "num_tokens": 748956512.0, + "step": 19626 + }, + { + "epoch": 2.496756137895942, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25792694091797, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8769068717956543, + "num_tokens": 748995577.0, + "step": 19627 + }, + { + "epoch": 2.4968833481745323, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10547637939453, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.868955135345459, + "num_tokens": 749027672.0, + "step": 19628 + }, + { + "epoch": 2.4970105584531233, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.921524047851562, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8835429549217224, + "num_tokens": 749059101.0, + "step": 19629 + }, + { + "epoch": 2.4971377687317133, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.288301467895508, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8575677871704102, + "num_tokens": 749097034.0, + "step": 19630 + }, + { + "epoch": 2.497264979010304, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17831039428711, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8698855638504028, + "num_tokens": 749137536.0, + "step": 19631 + }, + { + "epoch": 2.4973921892888944, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13163185119629, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8728702664375305, + "num_tokens": 749171487.0, + "step": 19632 + }, + { + "epoch": 2.497519399567485, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09636116027832, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8638273477554321, + "num_tokens": 749213234.0, + "step": 19633 + }, + { + "epoch": 2.4976466098460754, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19289779663086, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8611096143722534, + "num_tokens": 749256184.0, + "step": 19634 + }, + { + "epoch": 2.497773820124666, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.040756225585938, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8699440360069275, + "num_tokens": 749292909.0, + "step": 19635 + }, + { + "epoch": 2.4979010304032565, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24797248840332, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8596142530441284, + "num_tokens": 749327947.0, + "step": 19636 + }, + { + "epoch": 2.498028240681847, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.246904373168945, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8809033036231995, + "num_tokens": 749364107.0, + "step": 19637 + }, + { + "epoch": 2.4981554509604376, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.858076095581055, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8875234127044678, + "num_tokens": 749398055.0, + "step": 19638 + }, + { + "epoch": 2.498282661239028, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.393014907836914, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8720440864562988, + "num_tokens": 749434626.0, + "step": 19639 + }, + { + "epoch": 2.4984098715176186, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.197080612182617, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8653508424758911, + "num_tokens": 749470616.0, + "step": 19640 + }, + { + "epoch": 2.498537081796209, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.945316314697266, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8741357326507568, + "num_tokens": 749513072.0, + "step": 19641 + }, + { + "epoch": 2.4986642920747997, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.152801513671875, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8870678544044495, + "num_tokens": 749546988.0, + "step": 19642 + }, + { + "epoch": 2.49879150235339, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0617618560791, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8668583631515503, + "num_tokens": 749585851.0, + "step": 19643 + }, + { + "epoch": 2.4989187126319807, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.007741928100586, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8696537017822266, + "num_tokens": 749619510.0, + "step": 19644 + }, + { + "epoch": 2.4990459229105713, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.276103973388672, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8799171447753906, + "num_tokens": 749653660.0, + "step": 19645 + }, + { + "epoch": 2.499173133189162, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.295312881469727, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8725922107696533, + "num_tokens": 749691321.0, + "step": 19646 + }, + { + "epoch": 2.4993003434677523, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.074398040771484, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8727981448173523, + "num_tokens": 749729305.0, + "step": 19647 + }, + { + "epoch": 2.499427553746343, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.178924560546875, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8785023093223572, + "num_tokens": 749766134.0, + "step": 19648 + }, + { + "epoch": 2.4995547640249334, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95130729675293, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8696361184120178, + "num_tokens": 749807715.0, + "step": 19649 + }, + { + "epoch": 2.499681974303524, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.278013229370117, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8723419904708862, + "num_tokens": 749838799.0, + "step": 19650 + }, + { + "epoch": 2.499809184582114, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13713264465332, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8858734369277954, + "num_tokens": 749874283.0, + "step": 19651 + }, + { + "epoch": 2.499936394860705, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.039260864257812, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8695172071456909, + "num_tokens": 749915912.0, + "step": 19652 + }, + { + "epoch": 2.500063605139295, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.153318405151367, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8784690499305725, + "num_tokens": 749959133.0, + "step": 19653 + }, + { + "epoch": 2.500190815417886, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.020841598510742, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8687437772750854, + "num_tokens": 749997846.0, + "step": 19654 + }, + { + "epoch": 2.500318025696476, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.197288513183594, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8758823871612549, + "num_tokens": 750040973.0, + "step": 19655 + }, + { + "epoch": 2.5004452359750666, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.160629272460938, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.866870641708374, + "num_tokens": 750082171.0, + "step": 19656 + }, + { + "epoch": 2.500572446253657, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.93354606628418, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8837819695472717, + "num_tokens": 750123083.0, + "step": 19657 + }, + { + "epoch": 2.5006996565322477, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.892974853515625, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8585917353630066, + "num_tokens": 750162235.0, + "step": 19658 + }, + { + "epoch": 2.500826866810838, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07968521118164, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8851101994514465, + "num_tokens": 750196501.0, + "step": 19659 + }, + { + "epoch": 2.5009540770894287, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.123056411743164, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8439249992370605, + "num_tokens": 750241178.0, + "step": 19660 + }, + { + "epoch": 2.5010812873680193, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.065093994140625, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8771809339523315, + "num_tokens": 750284104.0, + "step": 19661 + }, + { + "epoch": 2.50120849764661, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.831605911254883, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8775992393493652, + "num_tokens": 750318525.0, + "step": 19662 + }, + { + "epoch": 2.5013357079252003, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.190767288208008, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8811994791030884, + "num_tokens": 750358892.0, + "step": 19663 + }, + { + "epoch": 2.501462918203791, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25681495666504, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8589388132095337, + "num_tokens": 750399325.0, + "step": 19664 + }, + { + "epoch": 2.5015901284823814, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.777976989746094, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8654735088348389, + "num_tokens": 750437493.0, + "step": 19665 + }, + { + "epoch": 2.501717338760972, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.348848342895508, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8802556991577148, + "num_tokens": 750472381.0, + "step": 19666 + }, + { + "epoch": 2.5018445490395624, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.180992126464844, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8552311062812805, + "num_tokens": 750511786.0, + "step": 19667 + }, + { + "epoch": 2.501971759318153, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.056631088256836, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8784446120262146, + "num_tokens": 750550596.0, + "step": 19668 + }, + { + "epoch": 2.5020989695967435, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.176671981811523, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8763364553451538, + "num_tokens": 750594790.0, + "step": 19669 + }, + { + "epoch": 2.502226179875334, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.258094787597656, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8839605450630188, + "num_tokens": 750636738.0, + "step": 19670 + }, + { + "epoch": 2.5023533901539246, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23711585998535, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8839868307113647, + "num_tokens": 750672338.0, + "step": 19671 + }, + { + "epoch": 2.502480600432515, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09588050842285, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8664392828941345, + "num_tokens": 750709149.0, + "step": 19672 + }, + { + "epoch": 2.5026078107111056, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.206741333007812, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8845707178115845, + "num_tokens": 750741727.0, + "step": 19673 + }, + { + "epoch": 2.5027350209896957, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08357048034668, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.860722541809082, + "num_tokens": 750775903.0, + "step": 19674 + }, + { + "epoch": 2.5028622312682867, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02206039428711, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8681466579437256, + "num_tokens": 750810502.0, + "step": 19675 + }, + { + "epoch": 2.5029894415468767, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.374404907226562, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8798699975013733, + "num_tokens": 750847762.0, + "step": 19676 + }, + { + "epoch": 2.5031166518254677, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.995080947875977, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.874126136302948, + "num_tokens": 750887201.0, + "step": 19677 + }, + { + "epoch": 2.503243862104058, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04722785949707, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8705404996871948, + "num_tokens": 750927323.0, + "step": 19678 + }, + { + "epoch": 2.5033710723826488, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.85982322692871, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8645620346069336, + "num_tokens": 750972501.0, + "step": 19679 + }, + { + "epoch": 2.503498282661239, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.98569107055664, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.882245659828186, + "num_tokens": 751010743.0, + "step": 19680 + }, + { + "epoch": 2.5036254929398294, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.331117630004883, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8675541877746582, + "num_tokens": 751050440.0, + "step": 19681 + }, + { + "epoch": 2.50375270321842, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.9149112701416, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8768700957298279, + "num_tokens": 751088319.0, + "step": 19682 + }, + { + "epoch": 2.5038799134970104, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.992605209350586, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8720869421958923, + "num_tokens": 751123766.0, + "step": 19683 + }, + { + "epoch": 2.504007123775601, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08852195739746, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.888974666595459, + "num_tokens": 751163375.0, + "step": 19684 + }, + { + "epoch": 2.5041343340541915, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.032001495361328, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.868512749671936, + "num_tokens": 751204526.0, + "step": 19685 + }, + { + "epoch": 2.504261544332782, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.318727493286133, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8771529197692871, + "num_tokens": 751245133.0, + "step": 19686 + }, + { + "epoch": 2.5043887546113726, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.166383743286133, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8620408773422241, + "num_tokens": 751284578.0, + "step": 19687 + }, + { + "epoch": 2.504515964889963, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03217315673828, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.863991916179657, + "num_tokens": 751325794.0, + "step": 19688 + }, + { + "epoch": 2.5046431751685536, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.976743698120117, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8791112899780273, + "num_tokens": 751363777.0, + "step": 19689 + }, + { + "epoch": 2.504770385447144, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.215105056762695, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8639012575149536, + "num_tokens": 751400499.0, + "step": 19690 + }, + { + "epoch": 2.5048975957257347, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.373620986938477, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8715871572494507, + "num_tokens": 751434895.0, + "step": 19691 + }, + { + "epoch": 2.505024806004325, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17546272277832, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8816401958465576, + "num_tokens": 751473984.0, + "step": 19692 + }, + { + "epoch": 2.5051520162829157, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.027389526367188, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.873054563999176, + "num_tokens": 751515030.0, + "step": 19693 + }, + { + "epoch": 2.5052792265615063, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.145898818969727, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8701030015945435, + "num_tokens": 751550023.0, + "step": 19694 + }, + { + "epoch": 2.505406436840097, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95699119567871, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8739688992500305, + "num_tokens": 751584793.0, + "step": 19695 + }, + { + "epoch": 2.5055336471186873, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10759162902832, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8542484641075134, + "num_tokens": 751619817.0, + "step": 19696 + }, + { + "epoch": 2.505660857397278, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.149383544921875, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8792500495910645, + "num_tokens": 751657097.0, + "step": 19697 + }, + { + "epoch": 2.5057880676758684, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.188753128051758, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8634451627731323, + "num_tokens": 751687042.0, + "step": 19698 + }, + { + "epoch": 2.5059152779544585, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.957902908325195, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8805794715881348, + "num_tokens": 751724281.0, + "step": 19699 + }, + { + "epoch": 2.5060424882330494, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.869504928588867, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8533164262771606, + "num_tokens": 751765144.0, + "step": 19700 + }, + { + "epoch": 2.5061696985116395, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.074058532714844, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8796143531799316, + "num_tokens": 751799602.0, + "step": 19701 + }, + { + "epoch": 2.5062969087902305, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24424171447754, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.880596399307251, + "num_tokens": 751836972.0, + "step": 19702 + }, + { + "epoch": 2.5064241190688206, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22604751586914, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8744757175445557, + "num_tokens": 751869627.0, + "step": 19703 + }, + { + "epoch": 2.5065513293474115, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.130699157714844, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8647968769073486, + "num_tokens": 751911103.0, + "step": 19704 + }, + { + "epoch": 2.5066785396260016, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.088489532470703, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8671799898147583, + "num_tokens": 751948956.0, + "step": 19705 + }, + { + "epoch": 2.506805749904592, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13409423828125, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8760444521903992, + "num_tokens": 751992185.0, + "step": 19706 + }, + { + "epoch": 2.5069329601831827, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.924943923950195, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8677879571914673, + "num_tokens": 752032661.0, + "step": 19707 + }, + { + "epoch": 2.507060170461773, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13257598876953, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8769951462745667, + "num_tokens": 752073946.0, + "step": 19708 + }, + { + "epoch": 2.5071873807403637, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.164396286010742, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8580989837646484, + "num_tokens": 752118766.0, + "step": 19709 + }, + { + "epoch": 2.5073145910189543, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.101078033447266, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.87532639503479, + "num_tokens": 752159669.0, + "step": 19710 + }, + { + "epoch": 2.507441801297545, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21739387512207, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8764838576316833, + "num_tokens": 752198139.0, + "step": 19711 + }, + { + "epoch": 2.5075690115761353, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.019346237182617, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8749532699584961, + "num_tokens": 752239686.0, + "step": 19712 + }, + { + "epoch": 2.507696221854726, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.97832679748535, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8543764352798462, + "num_tokens": 752275740.0, + "step": 19713 + }, + { + "epoch": 2.5078234321333164, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17877197265625, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8603355288505554, + "num_tokens": 752310554.0, + "step": 19714 + }, + { + "epoch": 2.507950642411907, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0336856842041, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8773471713066101, + "num_tokens": 752347761.0, + "step": 19715 + }, + { + "epoch": 2.5080778526904974, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13785171508789, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.862391471862793, + "num_tokens": 752387128.0, + "step": 19716 + }, + { + "epoch": 2.508205062969088, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.526540756225586, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8717132806777954, + "num_tokens": 752428722.0, + "step": 19717 + }, + { + "epoch": 2.5083322732476785, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12320327758789, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8713130950927734, + "num_tokens": 752468920.0, + "step": 19718 + }, + { + "epoch": 2.508459483526269, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.009550094604492, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8630262017250061, + "num_tokens": 752505226.0, + "step": 19719 + }, + { + "epoch": 2.5085866938048595, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.041973114013672, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8734657764434814, + "num_tokens": 752543693.0, + "step": 19720 + }, + { + "epoch": 2.50871390408345, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.163753509521484, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.85712730884552, + "num_tokens": 752578758.0, + "step": 19721 + }, + { + "epoch": 2.5088411143620406, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.990482330322266, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8824571967124939, + "num_tokens": 752612115.0, + "step": 19722 + }, + { + "epoch": 2.508968324640631, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.017593383789062, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.877684473991394, + "num_tokens": 752644601.0, + "step": 19723 + }, + { + "epoch": 2.509095534919221, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.153196334838867, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.87547767162323, + "num_tokens": 752682333.0, + "step": 19724 + }, + { + "epoch": 2.509222745197812, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.955415725708008, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8666714429855347, + "num_tokens": 752721802.0, + "step": 19725 + }, + { + "epoch": 2.5093499554764023, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44968605041504, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8873307704925537, + "num_tokens": 752755673.0, + "step": 19726 + }, + { + "epoch": 2.5094771657549932, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.226408004760742, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.881716787815094, + "num_tokens": 752795575.0, + "step": 19727 + }, + { + "epoch": 2.5096043760335833, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.963712692260742, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8875230550765991, + "num_tokens": 752833172.0, + "step": 19728 + }, + { + "epoch": 2.5097315863121743, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.213233947753906, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8575066328048706, + "num_tokens": 752872857.0, + "step": 19729 + }, + { + "epoch": 2.5098587965907644, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.012006759643555, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8816466331481934, + "num_tokens": 752908553.0, + "step": 19730 + }, + { + "epoch": 2.509986006869355, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07532501220703, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8780792355537415, + "num_tokens": 752937762.0, + "step": 19731 + }, + { + "epoch": 2.5101132171479454, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1468505859375, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8583071231842041, + "num_tokens": 752977184.0, + "step": 19732 + }, + { + "epoch": 2.510240427426536, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.309799194335938, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8683167099952698, + "num_tokens": 753021071.0, + "step": 19733 + }, + { + "epoch": 2.5103676377051265, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.268476486206055, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8458300828933716, + "num_tokens": 753056597.0, + "step": 19734 + }, + { + "epoch": 2.510494847983717, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.107486724853516, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8747328519821167, + "num_tokens": 753096178.0, + "step": 19735 + }, + { + "epoch": 2.5106220582623076, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14569854736328, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8754905462265015, + "num_tokens": 753130388.0, + "step": 19736 + }, + { + "epoch": 2.510749268540898, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.006038665771484, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8790886402130127, + "num_tokens": 753166585.0, + "step": 19737 + }, + { + "epoch": 2.5108764788194886, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.101289749145508, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.881130039691925, + "num_tokens": 753203403.0, + "step": 19738 + }, + { + "epoch": 2.511003689098079, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11810302734375, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8783196806907654, + "num_tokens": 753241632.0, + "step": 19739 + }, + { + "epoch": 2.5111308993766697, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.491069793701172, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.886293351650238, + "num_tokens": 753279292.0, + "step": 19740 + }, + { + "epoch": 2.51125810965526, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.072425842285156, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8631397485733032, + "num_tokens": 753319121.0, + "step": 19741 + }, + { + "epoch": 2.5113853199338507, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.156967163085938, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8574668169021606, + "num_tokens": 753353074.0, + "step": 19742 + }, + { + "epoch": 2.5115125302124413, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.323213577270508, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8902461528778076, + "num_tokens": 753388994.0, + "step": 19743 + }, + { + "epoch": 2.511639740491032, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.000450134277344, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.878522515296936, + "num_tokens": 753430271.0, + "step": 19744 + }, + { + "epoch": 2.5117669507696223, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.050729751586914, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.864621639251709, + "num_tokens": 753473820.0, + "step": 19745 + }, + { + "epoch": 2.511894161048213, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.062835693359375, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8686431646347046, + "num_tokens": 753516397.0, + "step": 19746 + }, + { + "epoch": 2.5120213713268034, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.105430603027344, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8828173875808716, + "num_tokens": 753557589.0, + "step": 19747 + }, + { + "epoch": 2.512148581605394, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.433795928955078, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8798884153366089, + "num_tokens": 753593764.0, + "step": 19748 + }, + { + "epoch": 2.512275791883984, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.151351928710938, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.860923707485199, + "num_tokens": 753635967.0, + "step": 19749 + }, + { + "epoch": 2.512403002162575, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99912452697754, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8795754909515381, + "num_tokens": 753674595.0, + "step": 19750 + }, + { + "epoch": 2.512530212441165, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21621322631836, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8570218086242676, + "num_tokens": 753714118.0, + "step": 19751 + }, + { + "epoch": 2.512657422719756, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.176570892333984, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8584141731262207, + "num_tokens": 753758139.0, + "step": 19752 + }, + { + "epoch": 2.512784632998346, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.246374130249023, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8605564832687378, + "num_tokens": 753800373.0, + "step": 19753 + }, + { + "epoch": 2.5129118432769366, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.133142471313477, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8667466640472412, + "num_tokens": 753832366.0, + "step": 19754 + }, + { + "epoch": 2.513039053555527, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.474973678588867, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8910900354385376, + "num_tokens": 753866176.0, + "step": 19755 + }, + { + "epoch": 2.5131662638341177, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.183088302612305, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8715200424194336, + "num_tokens": 753911470.0, + "step": 19756 + }, + { + "epoch": 2.513293474112708, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.077367782592773, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.869328498840332, + "num_tokens": 753953027.0, + "step": 19757 + }, + { + "epoch": 2.5134206843912987, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26970672607422, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.866316020488739, + "num_tokens": 753987992.0, + "step": 19758 + }, + { + "epoch": 2.5135478946698893, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06916618347168, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.876038670539856, + "num_tokens": 754026671.0, + "step": 19759 + }, + { + "epoch": 2.51367510494848, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.070743560791016, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8733258247375488, + "num_tokens": 754063526.0, + "step": 19760 + }, + { + "epoch": 2.5138023152270703, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.982803344726562, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8554177284240723, + "num_tokens": 754103361.0, + "step": 19761 + }, + { + "epoch": 2.513929525505661, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92079734802246, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.876642107963562, + "num_tokens": 754141969.0, + "step": 19762 + }, + { + "epoch": 2.5140567357842514, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15138816833496, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8881910443305969, + "num_tokens": 754182691.0, + "step": 19763 + }, + { + "epoch": 2.514183946062842, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92644691467285, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8714331984519958, + "num_tokens": 754219956.0, + "step": 19764 + }, + { + "epoch": 2.5143111563414324, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.155914306640625, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8676072955131531, + "num_tokens": 754259467.0, + "step": 19765 + }, + { + "epoch": 2.514438366620023, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.897306442260742, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8754945993423462, + "num_tokens": 754300344.0, + "step": 19766 + }, + { + "epoch": 2.5145655768986135, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.105224609375, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8710311651229858, + "num_tokens": 754342606.0, + "step": 19767 + }, + { + "epoch": 2.514692787177204, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23617172241211, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8684512376785278, + "num_tokens": 754379946.0, + "step": 19768 + }, + { + "epoch": 2.5148199974557945, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21172332763672, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8860253095626831, + "num_tokens": 754414353.0, + "step": 19769 + }, + { + "epoch": 2.514947207734385, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0289363861084, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.859701931476593, + "num_tokens": 754455506.0, + "step": 19770 + }, + { + "epoch": 2.5150744180129756, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.269575119018555, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8557554483413696, + "num_tokens": 754498316.0, + "step": 19771 + }, + { + "epoch": 2.5152016282915657, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04707145690918, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.860931932926178, + "num_tokens": 754534943.0, + "step": 19772 + }, + { + "epoch": 2.5153288385701567, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.190195083618164, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.869992733001709, + "num_tokens": 754572775.0, + "step": 19773 + }, + { + "epoch": 2.5154560488487467, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.149383544921875, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8823450207710266, + "num_tokens": 754610473.0, + "step": 19774 + }, + { + "epoch": 2.5155832591273377, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.023025512695312, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8733881711959839, + "num_tokens": 754649527.0, + "step": 19775 + }, + { + "epoch": 2.515710469405928, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.346139907836914, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8741185665130615, + "num_tokens": 754691513.0, + "step": 19776 + }, + { + "epoch": 2.5158376796845188, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.042179107666016, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8599152565002441, + "num_tokens": 754727932.0, + "step": 19777 + }, + { + "epoch": 2.515964889963109, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14899253845215, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8786494731903076, + "num_tokens": 754764571.0, + "step": 19778 + }, + { + "epoch": 2.5160921002416994, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.20459747314453, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.886716365814209, + "num_tokens": 754805846.0, + "step": 19779 + }, + { + "epoch": 2.51621931052029, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.8955135345459, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8791061043739319, + "num_tokens": 754844016.0, + "step": 19780 + }, + { + "epoch": 2.5163465207988804, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.198787689208984, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8724019527435303, + "num_tokens": 754886493.0, + "step": 19781 + }, + { + "epoch": 2.516473731077471, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.92464256286621, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8745492100715637, + "num_tokens": 754928195.0, + "step": 19782 + }, + { + "epoch": 2.5166009413560615, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.278718948364258, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8816349506378174, + "num_tokens": 754958399.0, + "step": 19783 + }, + { + "epoch": 2.516728151634652, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.260957717895508, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8770977258682251, + "num_tokens": 754997926.0, + "step": 19784 + }, + { + "epoch": 2.5168553619132426, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.95325469970703, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8778163194656372, + "num_tokens": 755040577.0, + "step": 19785 + }, + { + "epoch": 2.516982572191833, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.109947204589844, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8697121143341064, + "num_tokens": 755070458.0, + "step": 19786 + }, + { + "epoch": 2.5171097824704236, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.218719482421875, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.851399302482605, + "num_tokens": 755117205.0, + "step": 19787 + }, + { + "epoch": 2.517236992749014, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.013322830200195, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8743679523468018, + "num_tokens": 755162098.0, + "step": 19788 + }, + { + "epoch": 2.5173642030276047, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09695053100586, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8709063529968262, + "num_tokens": 755199655.0, + "step": 19789 + }, + { + "epoch": 2.517491413306195, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02480697631836, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8865333199501038, + "num_tokens": 755234165.0, + "step": 19790 + }, + { + "epoch": 2.5176186235847857, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04982566833496, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8741624355316162, + "num_tokens": 755271933.0, + "step": 19791 + }, + { + "epoch": 2.5177458338633762, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.205387115478516, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8814193606376648, + "num_tokens": 755306903.0, + "step": 19792 + }, + { + "epoch": 2.5178730441419668, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.207523345947266, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8800680637359619, + "num_tokens": 755347613.0, + "step": 19793 + }, + { + "epoch": 2.5180002544205573, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.078174591064453, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8912944793701172, + "num_tokens": 755380541.0, + "step": 19794 + }, + { + "epoch": 2.518127464699148, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195446014404297, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8701890707015991, + "num_tokens": 755422502.0, + "step": 19795 + }, + { + "epoch": 2.5182546749777384, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.034250259399414, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8732954859733582, + "num_tokens": 755463975.0, + "step": 19796 + }, + { + "epoch": 2.5183818852563284, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.842554092407227, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8738820552825928, + "num_tokens": 755504873.0, + "step": 19797 + }, + { + "epoch": 2.5185090955349194, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32060432434082, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8486449718475342, + "num_tokens": 755545646.0, + "step": 19798 + }, + { + "epoch": 2.5186363058135095, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.000728607177734, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.860064685344696, + "num_tokens": 755577228.0, + "step": 19799 + }, + { + "epoch": 2.5187635160921005, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.05380630493164, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8819255828857422, + "num_tokens": 755621437.0, + "step": 19800 + }, + { + "epoch": 2.5188907263706906, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.092487335205078, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8704589009284973, + "num_tokens": 755658266.0, + "step": 19801 + }, + { + "epoch": 2.5190179366492815, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.01170539855957, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8612782955169678, + "num_tokens": 755696755.0, + "step": 19802 + }, + { + "epoch": 2.5191451469278716, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.143878936767578, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8859933614730835, + "num_tokens": 755732434.0, + "step": 19803 + }, + { + "epoch": 2.519272357206462, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.246706008911133, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8778710961341858, + "num_tokens": 755768488.0, + "step": 19804 + }, + { + "epoch": 2.5193995674850527, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.088489532470703, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8797497749328613, + "num_tokens": 755799637.0, + "step": 19805 + }, + { + "epoch": 2.519526777763643, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22971534729004, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8569008111953735, + "num_tokens": 755834229.0, + "step": 19806 + }, + { + "epoch": 2.5196539880422337, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.190956115722656, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8772599697113037, + "num_tokens": 755871776.0, + "step": 19807 + }, + { + "epoch": 2.5197811983208243, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.025850296020508, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8833630084991455, + "num_tokens": 755911125.0, + "step": 19808 + }, + { + "epoch": 2.519908408599415, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.151588439941406, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8814852237701416, + "num_tokens": 755952204.0, + "step": 19809 + }, + { + "epoch": 2.5200356188780053, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.001766204833984, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8617669343948364, + "num_tokens": 755996702.0, + "step": 19810 + }, + { + "epoch": 2.520162829156596, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11248779296875, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8889871835708618, + "num_tokens": 756033285.0, + "step": 19811 + }, + { + "epoch": 2.5202900394351864, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.20075798034668, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8723697662353516, + "num_tokens": 756068851.0, + "step": 19812 + }, + { + "epoch": 2.520417249713777, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02515983581543, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.864391565322876, + "num_tokens": 756111676.0, + "step": 19813 + }, + { + "epoch": 2.5205444599923674, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22576904296875, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8566218614578247, + "num_tokens": 756146575.0, + "step": 19814 + }, + { + "epoch": 2.520671670270958, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.031227111816406, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8629269599914551, + "num_tokens": 756190006.0, + "step": 19815 + }, + { + "epoch": 2.5207988805495485, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14630126953125, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8841363191604614, + "num_tokens": 756232828.0, + "step": 19816 + }, + { + "epoch": 2.520926090828139, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07640266418457, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8792527914047241, + "num_tokens": 756273077.0, + "step": 19817 + }, + { + "epoch": 2.5210533011067295, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.852920532226562, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8572174906730652, + "num_tokens": 756313421.0, + "step": 19818 + }, + { + "epoch": 2.52118051138532, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13917350769043, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.868550181388855, + "num_tokens": 756356461.0, + "step": 19819 + }, + { + "epoch": 2.5213077216639106, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.081995010375977, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8694523572921753, + "num_tokens": 756388309.0, + "step": 19820 + }, + { + "epoch": 2.521434931942501, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.994211196899414, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8833330869674683, + "num_tokens": 756426655.0, + "step": 19821 + }, + { + "epoch": 2.521562142221091, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08256721496582, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8701187968254089, + "num_tokens": 756464961.0, + "step": 19822 + }, + { + "epoch": 2.521689352499682, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.037757873535156, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.88747638463974, + "num_tokens": 756497338.0, + "step": 19823 + }, + { + "epoch": 2.5218165627782723, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12202262878418, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8786540031433105, + "num_tokens": 756535324.0, + "step": 19824 + }, + { + "epoch": 2.5219437730568632, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.112560272216797, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8812799453735352, + "num_tokens": 756574138.0, + "step": 19825 + }, + { + "epoch": 2.5220709833354533, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.232236862182617, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.882379412651062, + "num_tokens": 756608967.0, + "step": 19826 + }, + { + "epoch": 2.522198193614044, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1628360748291, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8525500893592834, + "num_tokens": 756653882.0, + "step": 19827 + }, + { + "epoch": 2.5223254038926344, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.033931732177734, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8673806190490723, + "num_tokens": 756695638.0, + "step": 19828 + }, + { + "epoch": 2.522452614171225, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08870506286621, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.866310715675354, + "num_tokens": 756734964.0, + "step": 19829 + }, + { + "epoch": 2.5225798244498154, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.089509963989258, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.877210259437561, + "num_tokens": 756768454.0, + "step": 19830 + }, + { + "epoch": 2.522707034728406, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.88997459411621, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.862746000289917, + "num_tokens": 756801696.0, + "step": 19831 + }, + { + "epoch": 2.5228342450069965, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13738441467285, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8735977411270142, + "num_tokens": 756839874.0, + "step": 19832 + }, + { + "epoch": 2.522961455285587, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18111801147461, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8623948097229004, + "num_tokens": 756884179.0, + "step": 19833 + }, + { + "epoch": 2.5230886655641775, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.010950088500977, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.874506413936615, + "num_tokens": 756922725.0, + "step": 19834 + }, + { + "epoch": 2.523215875842768, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.133756637573242, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8684593439102173, + "num_tokens": 756956080.0, + "step": 19835 + }, + { + "epoch": 2.5233430861213586, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06100845336914, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.863592803478241, + "num_tokens": 756995137.0, + "step": 19836 + }, + { + "epoch": 2.523470296399949, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.070207595825195, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8792476654052734, + "num_tokens": 757035300.0, + "step": 19837 + }, + { + "epoch": 2.5235975066785397, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19276237487793, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8490622043609619, + "num_tokens": 757068789.0, + "step": 19838 + }, + { + "epoch": 2.52372471695713, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21157455444336, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8668006658554077, + "num_tokens": 757102913.0, + "step": 19839 + }, + { + "epoch": 2.5238519272357207, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.977285385131836, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8735796213150024, + "num_tokens": 757138932.0, + "step": 19840 + }, + { + "epoch": 2.5239791375143112, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.031644821166992, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8759893178939819, + "num_tokens": 757179181.0, + "step": 19841 + }, + { + "epoch": 2.5241063477929018, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14310646057129, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8429655432701111, + "num_tokens": 757224209.0, + "step": 19842 + }, + { + "epoch": 2.5242335580714923, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.067880630493164, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8846782445907593, + "num_tokens": 757262398.0, + "step": 19843 + }, + { + "epoch": 2.524360768350083, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.884910583496094, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8630054593086243, + "num_tokens": 757301744.0, + "step": 19844 + }, + { + "epoch": 2.5244879786286734, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03736114501953, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8769150972366333, + "num_tokens": 757338273.0, + "step": 19845 + }, + { + "epoch": 2.524615188907264, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.01078987121582, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8644119501113892, + "num_tokens": 757383169.0, + "step": 19846 + }, + { + "epoch": 2.524742399185854, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.079177856445312, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8615411520004272, + "num_tokens": 757420998.0, + "step": 19847 + }, + { + "epoch": 2.524869609464445, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04897117614746, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8762730360031128, + "num_tokens": 757463281.0, + "step": 19848 + }, + { + "epoch": 2.524996819743035, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.106382369995117, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8675345182418823, + "num_tokens": 757495022.0, + "step": 19849 + }, + { + "epoch": 2.525124030021626, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.179716110229492, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8773146867752075, + "num_tokens": 757536801.0, + "step": 19850 + }, + { + "epoch": 2.525251240300216, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.210037231445312, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8876535296440125, + "num_tokens": 757573419.0, + "step": 19851 + }, + { + "epoch": 2.5253784505788066, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.493955612182617, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8675538301467896, + "num_tokens": 757610773.0, + "step": 19852 + }, + { + "epoch": 2.525505660857397, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.106746673583984, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8821125030517578, + "num_tokens": 757648067.0, + "step": 19853 + }, + { + "epoch": 2.5256328711359877, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.111061096191406, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8781232833862305, + "num_tokens": 757681598.0, + "step": 19854 + }, + { + "epoch": 2.525760081414578, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.232635498046875, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.854586660861969, + "num_tokens": 757724752.0, + "step": 19855 + }, + { + "epoch": 2.5258872916931687, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29521942138672, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8840415477752686, + "num_tokens": 757763801.0, + "step": 19856 + }, + { + "epoch": 2.5260145019717593, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27206039428711, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8716790676116943, + "num_tokens": 757799235.0, + "step": 19857 + }, + { + "epoch": 2.52614171225035, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.022708892822266, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8767132759094238, + "num_tokens": 757839695.0, + "step": 19858 + }, + { + "epoch": 2.5262689225289403, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73855972290039, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8592948913574219, + "num_tokens": 757877152.0, + "step": 19859 + }, + { + "epoch": 2.526396132807531, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.198959350585938, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8706527948379517, + "num_tokens": 757916173.0, + "step": 19860 + }, + { + "epoch": 2.5265233430861214, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.075061798095703, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8777403831481934, + "num_tokens": 757957271.0, + "step": 19861 + }, + { + "epoch": 2.526650553364712, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.239730834960938, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8661456108093262, + "num_tokens": 757995553.0, + "step": 19862 + }, + { + "epoch": 2.5267777636433024, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.154388427734375, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8613579869270325, + "num_tokens": 758035095.0, + "step": 19863 + }, + { + "epoch": 2.526904973921893, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17351531982422, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8706234097480774, + "num_tokens": 758071185.0, + "step": 19864 + }, + { + "epoch": 2.5270321842004835, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3325138092041, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8733969926834106, + "num_tokens": 758110251.0, + "step": 19865 + }, + { + "epoch": 2.527159394479074, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.164775848388672, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8573097586631775, + "num_tokens": 758144385.0, + "step": 19866 + }, + { + "epoch": 2.5272866047576645, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19271469116211, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8830116987228394, + "num_tokens": 758185935.0, + "step": 19867 + }, + { + "epoch": 2.527413815036255, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.085376739501953, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.876368522644043, + "num_tokens": 758226688.0, + "step": 19868 + }, + { + "epoch": 2.5275410253148456, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.184616088867188, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.877632737159729, + "num_tokens": 758263866.0, + "step": 19869 + }, + { + "epoch": 2.5276682355934357, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.200937271118164, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8780210018157959, + "num_tokens": 758303607.0, + "step": 19870 + }, + { + "epoch": 2.5277954458720266, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14493751525879, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8761595487594604, + "num_tokens": 758340120.0, + "step": 19871 + }, + { + "epoch": 2.5279226561506167, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.135066986083984, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8724244832992554, + "num_tokens": 758373792.0, + "step": 19872 + }, + { + "epoch": 2.5280498664292077, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12976837158203, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8604720830917358, + "num_tokens": 758418157.0, + "step": 19873 + }, + { + "epoch": 2.528177076707798, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.169767379760742, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8652687072753906, + "num_tokens": 758454042.0, + "step": 19874 + }, + { + "epoch": 2.5283042869863888, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.94025993347168, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8736466765403748, + "num_tokens": 758495305.0, + "step": 19875 + }, + { + "epoch": 2.528431497264979, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.192089080810547, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8800552487373352, + "num_tokens": 758532646.0, + "step": 19876 + }, + { + "epoch": 2.5285587075435694, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.962299346923828, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8679783344268799, + "num_tokens": 758568329.0, + "step": 19877 + }, + { + "epoch": 2.52868591782216, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26997184753418, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8751595616340637, + "num_tokens": 758608246.0, + "step": 19878 + }, + { + "epoch": 2.5288131281007504, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.99627685546875, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8823586106300354, + "num_tokens": 758645356.0, + "step": 19879 + }, + { + "epoch": 2.528940338379341, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.154033660888672, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8871918320655823, + "num_tokens": 758685657.0, + "step": 19880 + }, + { + "epoch": 2.5290675486579315, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.147869110107422, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8758774399757385, + "num_tokens": 758718843.0, + "step": 19881 + }, + { + "epoch": 2.529194758936522, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.318952560424805, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8761730194091797, + "num_tokens": 758757946.0, + "step": 19882 + }, + { + "epoch": 2.5293219692151125, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.324840545654297, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8541793823242188, + "num_tokens": 758792832.0, + "step": 19883 + }, + { + "epoch": 2.529449179493703, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28729820251465, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.864952564239502, + "num_tokens": 758826062.0, + "step": 19884 + }, + { + "epoch": 2.5295763897722936, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.343111038208008, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8811807036399841, + "num_tokens": 758860940.0, + "step": 19885 + }, + { + "epoch": 2.529703600050884, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42607307434082, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8706468939781189, + "num_tokens": 758900740.0, + "step": 19886 + }, + { + "epoch": 2.5298308103294747, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11993980407715, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8695331811904907, + "num_tokens": 758942786.0, + "step": 19887 + }, + { + "epoch": 2.529958020608065, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.176916122436523, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8812432289123535, + "num_tokens": 758983376.0, + "step": 19888 + }, + { + "epoch": 2.5300852308866557, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25252342224121, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8605506420135498, + "num_tokens": 759024007.0, + "step": 19889 + }, + { + "epoch": 2.5302124411652462, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36701774597168, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.879490852355957, + "num_tokens": 759061611.0, + "step": 19890 + }, + { + "epoch": 2.5303396514438368, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.995513916015625, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8569715023040771, + "num_tokens": 759101942.0, + "step": 19891 + }, + { + "epoch": 2.5304668617224273, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.469928741455078, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8898506164550781, + "num_tokens": 759140281.0, + "step": 19892 + }, + { + "epoch": 2.530594072001018, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.454023361206055, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8724958896636963, + "num_tokens": 759174319.0, + "step": 19893 + }, + { + "epoch": 2.5307212822796084, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.987438201904297, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8794189095497131, + "num_tokens": 759208981.0, + "step": 19894 + }, + { + "epoch": 2.5308484925581984, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65009880065918, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8748731017112732, + "num_tokens": 759240559.0, + "step": 19895 + }, + { + "epoch": 2.5309757028367894, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24662971496582, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8848649263381958, + "num_tokens": 759280550.0, + "step": 19896 + }, + { + "epoch": 2.5311029131153795, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.313777923583984, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8661506175994873, + "num_tokens": 759321298.0, + "step": 19897 + }, + { + "epoch": 2.5312301233939705, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2161922454834, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8852623701095581, + "num_tokens": 759354627.0, + "step": 19898 + }, + { + "epoch": 2.5313573336725606, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.315383911132812, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8674601316452026, + "num_tokens": 759388129.0, + "step": 19899 + }, + { + "epoch": 2.5314845439511515, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.299814224243164, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8822316527366638, + "num_tokens": 759423667.0, + "step": 19900 + }, + { + "epoch": 2.5316117542297416, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17496109008789, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8634387254714966, + "num_tokens": 759466586.0, + "step": 19901 + }, + { + "epoch": 2.531738964508332, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.261018753051758, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8675700426101685, + "num_tokens": 759504214.0, + "step": 19902 + }, + { + "epoch": 2.5318661747869227, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.289012908935547, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8588446378707886, + "num_tokens": 759543865.0, + "step": 19903 + }, + { + "epoch": 2.531993385065513, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.289573669433594, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8685440421104431, + "num_tokens": 759581917.0, + "step": 19904 + }, + { + "epoch": 2.5321205953441037, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.132068634033203, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8757932186126709, + "num_tokens": 759617641.0, + "step": 19905 + }, + { + "epoch": 2.5322478056226942, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.150882720947266, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.883797287940979, + "num_tokens": 759649097.0, + "step": 19906 + }, + { + "epoch": 2.5323750159012848, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.171039581298828, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8677214980125427, + "num_tokens": 759688802.0, + "step": 19907 + }, + { + "epoch": 2.5325022261798753, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.452617645263672, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8827822208404541, + "num_tokens": 759724243.0, + "step": 19908 + }, + { + "epoch": 2.532629436458466, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3021297454834, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8658188581466675, + "num_tokens": 759764646.0, + "step": 19909 + }, + { + "epoch": 2.5327566467370564, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.375410079956055, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8745806217193604, + "num_tokens": 759806939.0, + "step": 19910 + }, + { + "epoch": 2.532883857015647, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.340404510498047, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8743495345115662, + "num_tokens": 759848138.0, + "step": 19911 + }, + { + "epoch": 2.5330110672942374, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35419273376465, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8693887591362, + "num_tokens": 759881877.0, + "step": 19912 + }, + { + "epoch": 2.533138277572828, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.435640335083008, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8834400177001953, + "num_tokens": 759922625.0, + "step": 19913 + }, + { + "epoch": 2.5332654878514185, + "ewc_loss": 0.0380859375, + "ewc_loss_parallel": 3.814697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.04195785522461, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8836333751678467, + "num_tokens": 759952429.0, + "step": 19914 + }, + { + "epoch": 2.533392698130009, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59054946899414, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8747400045394897, + "num_tokens": 759987678.0, + "step": 19915 + }, + { + "epoch": 2.5335199084085995, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.459688186645508, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.858841061592102, + "num_tokens": 760023943.0, + "step": 19916 + }, + { + "epoch": 2.53364711868719, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.184417724609375, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8764232993125916, + "num_tokens": 760068229.0, + "step": 19917 + }, + { + "epoch": 2.5337743289657806, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.427352905273438, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8595007658004761, + "num_tokens": 760109440.0, + "step": 19918 + }, + { + "epoch": 2.533901539244371, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.422536849975586, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8743664622306824, + "num_tokens": 760146616.0, + "step": 19919 + }, + { + "epoch": 2.534028749522961, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.263656616210938, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8558681011199951, + "num_tokens": 760188708.0, + "step": 19920 + }, + { + "epoch": 2.534155959801552, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.925556182861328, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8889622092247009, + "num_tokens": 760231355.0, + "step": 19921 + }, + { + "epoch": 2.5342831700801423, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.535579681396484, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.859536349773407, + "num_tokens": 760268316.0, + "step": 19922 + }, + { + "epoch": 2.5344103803587332, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.423534393310547, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.864174485206604, + "num_tokens": 760305420.0, + "step": 19923 + }, + { + "epoch": 2.5345375906373233, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.174983978271484, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.892085075378418, + "num_tokens": 760342195.0, + "step": 19924 + }, + { + "epoch": 2.534664800915914, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18312644958496, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8862185478210449, + "num_tokens": 760375394.0, + "step": 19925 + }, + { + "epoch": 2.5347920111945044, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.178144454956055, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8615154027938843, + "num_tokens": 760411629.0, + "step": 19926 + }, + { + "epoch": 2.534919221473095, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.057140350341797, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.874902606010437, + "num_tokens": 760456212.0, + "step": 19927 + }, + { + "epoch": 2.5350464317516854, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.122804641723633, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8634122610092163, + "num_tokens": 760496385.0, + "step": 19928 + }, + { + "epoch": 2.535173642030276, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.386091232299805, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8718335032463074, + "num_tokens": 760530426.0, + "step": 19929 + }, + { + "epoch": 2.5353008523088665, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.063884735107422, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.884224534034729, + "num_tokens": 760568174.0, + "step": 19930 + }, + { + "epoch": 2.535428062587457, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.323057174682617, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8903477191925049, + "num_tokens": 760608097.0, + "step": 19931 + }, + { + "epoch": 2.5355552728660475, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.176239013671875, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8725502490997314, + "num_tokens": 760643018.0, + "step": 19932 + }, + { + "epoch": 2.535682483144638, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.290061950683594, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8810548782348633, + "num_tokens": 760676553.0, + "step": 19933 + }, + { + "epoch": 2.5358096934232286, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.135963439941406, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8872795104980469, + "num_tokens": 760708850.0, + "step": 19934 + }, + { + "epoch": 2.535936903701819, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.056455612182617, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8695715069770813, + "num_tokens": 760746088.0, + "step": 19935 + }, + { + "epoch": 2.5360641139804097, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45927619934082, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8786042928695679, + "num_tokens": 760785716.0, + "step": 19936 + }, + { + "epoch": 2.536191324259, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43834114074707, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8821595311164856, + "num_tokens": 760821301.0, + "step": 19937 + }, + { + "epoch": 2.5363185345375907, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.177993774414062, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8632313013076782, + "num_tokens": 760861200.0, + "step": 19938 + }, + { + "epoch": 2.5364457448161812, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34897232055664, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8715600967407227, + "num_tokens": 760896398.0, + "step": 19939 + }, + { + "epoch": 2.5365729550947718, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.407638549804688, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8539347648620605, + "num_tokens": 760937300.0, + "step": 19940 + }, + { + "epoch": 2.5367001653733623, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.184280395507812, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8672264218330383, + "num_tokens": 760973726.0, + "step": 19941 + }, + { + "epoch": 2.536827375651953, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.188322067260742, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8710286617279053, + "num_tokens": 761009048.0, + "step": 19942 + }, + { + "epoch": 2.5369545859305433, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.469005584716797, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8705811500549316, + "num_tokens": 761050836.0, + "step": 19943 + }, + { + "epoch": 2.537081796209134, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.495847702026367, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8661497831344604, + "num_tokens": 761087517.0, + "step": 19944 + }, + { + "epoch": 2.537209006487724, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.202016830444336, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.886364221572876, + "num_tokens": 761121401.0, + "step": 19945 + }, + { + "epoch": 2.537336216766315, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.257888793945312, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8616580963134766, + "num_tokens": 761154973.0, + "step": 19946 + }, + { + "epoch": 2.537463427044905, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.320411682128906, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8809491395950317, + "num_tokens": 761194211.0, + "step": 19947 + }, + { + "epoch": 2.537590637323496, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08241081237793, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8668537139892578, + "num_tokens": 761233486.0, + "step": 19948 + }, + { + "epoch": 2.537717847602086, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.164514541625977, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8733439445495605, + "num_tokens": 761275846.0, + "step": 19949 + }, + { + "epoch": 2.5378450578806766, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66583824157715, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8626599311828613, + "num_tokens": 761311210.0, + "step": 19950 + }, + { + "epoch": 2.537972268159267, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28994369506836, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8720870018005371, + "num_tokens": 761349961.0, + "step": 19951 + }, + { + "epoch": 2.5380994784378577, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.01779556274414, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8808678388595581, + "num_tokens": 761386993.0, + "step": 19952 + }, + { + "epoch": 2.538226688716448, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27151107788086, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8620160818099976, + "num_tokens": 761426910.0, + "step": 19953 + }, + { + "epoch": 2.5383538989950387, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3502197265625, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8752768039703369, + "num_tokens": 761466633.0, + "step": 19954 + }, + { + "epoch": 2.5384811092736292, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21800422668457, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8594149947166443, + "num_tokens": 761497670.0, + "step": 19955 + }, + { + "epoch": 2.5386083195522198, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.349403381347656, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8698159456253052, + "num_tokens": 761534626.0, + "step": 19956 + }, + { + "epoch": 2.5387355298308103, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25551414489746, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8800750970840454, + "num_tokens": 761581125.0, + "step": 19957 + }, + { + "epoch": 2.538862740109401, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.088930130004883, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.873291015625, + "num_tokens": 761616533.0, + "step": 19958 + }, + { + "epoch": 2.5389899503879914, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.408132553100586, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8622839450836182, + "num_tokens": 761650615.0, + "step": 19959 + }, + { + "epoch": 2.539117160666582, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2780704498291, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8842625021934509, + "num_tokens": 761687020.0, + "step": 19960 + }, + { + "epoch": 2.5392443709451724, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22604751586914, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8885008096694946, + "num_tokens": 761721653.0, + "step": 19961 + }, + { + "epoch": 2.539371581223763, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.381555557250977, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.880480945110321, + "num_tokens": 761758377.0, + "step": 19962 + }, + { + "epoch": 2.5394987915023535, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.237146377563477, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8721688985824585, + "num_tokens": 761795263.0, + "step": 19963 + }, + { + "epoch": 2.539626001780944, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.325706481933594, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8829323053359985, + "num_tokens": 761834758.0, + "step": 19964 + }, + { + "epoch": 2.5397532120595345, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.316991806030273, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8661859631538391, + "num_tokens": 761880482.0, + "step": 19965 + }, + { + "epoch": 2.539880422338125, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17278289794922, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8718799948692322, + "num_tokens": 761922494.0, + "step": 19966 + }, + { + "epoch": 2.5400076326167156, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.282936096191406, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8913494348526001, + "num_tokens": 761959462.0, + "step": 19967 + }, + { + "epoch": 2.5401348428953057, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.213233947753906, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8645570874214172, + "num_tokens": 761990316.0, + "step": 19968 + }, + { + "epoch": 2.5402620531738966, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30707550048828, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8667789101600647, + "num_tokens": 762023671.0, + "step": 19969 + }, + { + "epoch": 2.5403892634524867, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.323488235473633, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8778085708618164, + "num_tokens": 762056719.0, + "step": 19970 + }, + { + "epoch": 2.5405164737310777, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12516975402832, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8742694854736328, + "num_tokens": 762098802.0, + "step": 19971 + }, + { + "epoch": 2.540643684009668, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.183462142944336, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8755115270614624, + "num_tokens": 762134470.0, + "step": 19972 + }, + { + "epoch": 2.5407708942882588, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.362411499023438, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8765658140182495, + "num_tokens": 762177163.0, + "step": 19973 + }, + { + "epoch": 2.540898104566849, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.456552505493164, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8801710605621338, + "num_tokens": 762217323.0, + "step": 19974 + }, + { + "epoch": 2.5410253148454394, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.012413024902344, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8687641620635986, + "num_tokens": 762252144.0, + "step": 19975 + }, + { + "epoch": 2.54115252512403, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.099956512451172, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8671259880065918, + "num_tokens": 762282108.0, + "step": 19976 + }, + { + "epoch": 2.5412797354026204, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44789695739746, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8766676187515259, + "num_tokens": 762318154.0, + "step": 19977 + }, + { + "epoch": 2.541406945681211, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06307029724121, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8700705766677856, + "num_tokens": 762354704.0, + "step": 19978 + }, + { + "epoch": 2.5415341559598015, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.224519729614258, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8753403425216675, + "num_tokens": 762389943.0, + "step": 19979 + }, + { + "epoch": 2.541661366238392, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33173942565918, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8758867979049683, + "num_tokens": 762421560.0, + "step": 19980 + }, + { + "epoch": 2.5417885765169825, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.169910430908203, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8785171508789062, + "num_tokens": 762458509.0, + "step": 19981 + }, + { + "epoch": 2.541915786795573, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15993881225586, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.872038722038269, + "num_tokens": 762502382.0, + "step": 19982 + }, + { + "epoch": 2.5420429970741636, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.981985092163086, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8693424463272095, + "num_tokens": 762536611.0, + "step": 19983 + }, + { + "epoch": 2.542170207352754, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.48780632019043, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8731213808059692, + "num_tokens": 762577801.0, + "step": 19984 + }, + { + "epoch": 2.5422974176313446, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.274372100830078, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.874093770980835, + "num_tokens": 762617413.0, + "step": 19985 + }, + { + "epoch": 2.542424627909935, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.131624221801758, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.881649374961853, + "num_tokens": 762654811.0, + "step": 19986 + }, + { + "epoch": 2.5425518381885257, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.183650970458984, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8778605461120605, + "num_tokens": 762692902.0, + "step": 19987 + }, + { + "epoch": 2.5426790484671162, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.251317977905273, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8731474280357361, + "num_tokens": 762731984.0, + "step": 19988 + }, + { + "epoch": 2.5428062587457068, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1207275390625, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8821181058883667, + "num_tokens": 762767104.0, + "step": 19989 + }, + { + "epoch": 2.5429334690242973, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.090557098388672, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8617411851882935, + "num_tokens": 762804982.0, + "step": 19990 + }, + { + "epoch": 2.543060679302888, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.218639373779297, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8633204698562622, + "num_tokens": 762842956.0, + "step": 19991 + }, + { + "epoch": 2.5431878895814783, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42057991027832, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8791922926902771, + "num_tokens": 762875445.0, + "step": 19992 + }, + { + "epoch": 2.5433150998600684, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43159294128418, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8859969973564148, + "num_tokens": 762906061.0, + "step": 19993 + }, + { + "epoch": 2.5434423101386594, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.110925674438477, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8807403445243835, + "num_tokens": 762945550.0, + "step": 19994 + }, + { + "epoch": 2.5435695204172495, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.145231246948242, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8735325932502747, + "num_tokens": 762982586.0, + "step": 19995 + }, + { + "epoch": 2.5436967306958405, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.247419357299805, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8751449584960938, + "num_tokens": 763013527.0, + "step": 19996 + }, + { + "epoch": 2.5438239409744305, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.338932037353516, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8679744005203247, + "num_tokens": 763054363.0, + "step": 19997 + }, + { + "epoch": 2.5439511512530215, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.140796661376953, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8686434626579285, + "num_tokens": 763096834.0, + "step": 19998 + }, + { + "epoch": 2.5440783615316116, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.310775756835938, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8721774816513062, + "num_tokens": 763139611.0, + "step": 19999 + }, + { + "epoch": 2.544205571810202, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31023597717285, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8739837408065796, + "num_tokens": 763181085.0, + "step": 20000 + }, + { + "epoch": 2.5443327820887927, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.299562454223633, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.870874285697937, + "num_tokens": 763219811.0, + "step": 20001 + }, + { + "epoch": 2.544459992367383, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28681755065918, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8621373176574707, + "num_tokens": 763257745.0, + "step": 20002 + }, + { + "epoch": 2.5445872026459737, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21757698059082, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8581031560897827, + "num_tokens": 763299475.0, + "step": 20003 + }, + { + "epoch": 2.5447144129245642, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.194055557250977, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8503418564796448, + "num_tokens": 763333960.0, + "step": 20004 + }, + { + "epoch": 2.5448416232031548, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.155029296875, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8641930818557739, + "num_tokens": 763378646.0, + "step": 20005 + }, + { + "epoch": 2.5449688334817453, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.312509536743164, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.877186119556427, + "num_tokens": 763412441.0, + "step": 20006 + }, + { + "epoch": 2.545096043760336, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.169878005981445, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8770173788070679, + "num_tokens": 763446713.0, + "step": 20007 + }, + { + "epoch": 2.5452232540389264, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.120677947998047, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8833746910095215, + "num_tokens": 763478686.0, + "step": 20008 + }, + { + "epoch": 2.545350464317517, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.239465713500977, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8884838223457336, + "num_tokens": 763517258.0, + "step": 20009 + }, + { + "epoch": 2.5454776745961074, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.068706512451172, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8677470684051514, + "num_tokens": 763553229.0, + "step": 20010 + }, + { + "epoch": 2.545604884874698, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.567453384399414, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8805514574050903, + "num_tokens": 763594274.0, + "step": 20011 + }, + { + "epoch": 2.5457320951532885, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.375316619873047, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8647648692131042, + "num_tokens": 763632162.0, + "step": 20012 + }, + { + "epoch": 2.545859305431879, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.255516052246094, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.874294102191925, + "num_tokens": 763666797.0, + "step": 20013 + }, + { + "epoch": 2.5459865157104695, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.371551513671875, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8708527088165283, + "num_tokens": 763705475.0, + "step": 20014 + }, + { + "epoch": 2.54611372598906, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35944938659668, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8884880542755127, + "num_tokens": 763739434.0, + "step": 20015 + }, + { + "epoch": 2.5462409362676506, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35430908203125, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8605701923370361, + "num_tokens": 763773262.0, + "step": 20016 + }, + { + "epoch": 2.546368146546241, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28605842590332, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8621435165405273, + "num_tokens": 763812573.0, + "step": 20017 + }, + { + "epoch": 2.546495356824831, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.302692413330078, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8758418560028076, + "num_tokens": 763848555.0, + "step": 20018 + }, + { + "epoch": 2.546622567103422, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.110668182373047, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.871296763420105, + "num_tokens": 763884608.0, + "step": 20019 + }, + { + "epoch": 2.5467497773820122, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.270992279052734, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8688370585441589, + "num_tokens": 763922038.0, + "step": 20020 + }, + { + "epoch": 2.546876987660603, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.173879623413086, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8702969551086426, + "num_tokens": 763958089.0, + "step": 20021 + }, + { + "epoch": 2.5470041979391933, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11164665222168, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.87413090467453, + "num_tokens": 764000802.0, + "step": 20022 + }, + { + "epoch": 2.547131408217784, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1507625579834, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8771106004714966, + "num_tokens": 764036628.0, + "step": 20023 + }, + { + "epoch": 2.5472586184963744, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4768123626709, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8756542801856995, + "num_tokens": 764073643.0, + "step": 20024 + }, + { + "epoch": 2.547385828774965, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40188217163086, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8786421418190002, + "num_tokens": 764110473.0, + "step": 20025 + }, + { + "epoch": 2.5475130390535554, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22028350830078, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8709495067596436, + "num_tokens": 764142941.0, + "step": 20026 + }, + { + "epoch": 2.547640249332146, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41143798828125, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8752439618110657, + "num_tokens": 764178752.0, + "step": 20027 + }, + { + "epoch": 2.5477674596107365, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1844425201416, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8695998191833496, + "num_tokens": 764217440.0, + "step": 20028 + }, + { + "epoch": 2.547894669889327, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.084081649780273, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8637233972549438, + "num_tokens": 764257524.0, + "step": 20029 + }, + { + "epoch": 2.5480218801679175, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.157573699951172, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8688282370567322, + "num_tokens": 764293645.0, + "step": 20030 + }, + { + "epoch": 2.548149090446508, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.358407974243164, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8798813223838806, + "num_tokens": 764331660.0, + "step": 20031 + }, + { + "epoch": 2.5482763007250986, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.380495071411133, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8901840448379517, + "num_tokens": 764376577.0, + "step": 20032 + }, + { + "epoch": 2.548403511003689, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.272212982177734, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8705657720565796, + "num_tokens": 764415492.0, + "step": 20033 + }, + { + "epoch": 2.5485307212822796, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.345186233520508, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8636895418167114, + "num_tokens": 764453948.0, + "step": 20034 + }, + { + "epoch": 2.54865793156087, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13702392578125, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8781601190567017, + "num_tokens": 764493906.0, + "step": 20035 + }, + { + "epoch": 2.5487851418394607, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30771827697754, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8672257661819458, + "num_tokens": 764532379.0, + "step": 20036 + }, + { + "epoch": 2.5489123521180512, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.306392669677734, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8565565347671509, + "num_tokens": 764567726.0, + "step": 20037 + }, + { + "epoch": 2.5490395623966418, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.559946060180664, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8734626770019531, + "num_tokens": 764606229.0, + "step": 20038 + }, + { + "epoch": 2.5491667726752323, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21766471862793, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.882796049118042, + "num_tokens": 764641346.0, + "step": 20039 + }, + { + "epoch": 2.549293982953823, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26558494567871, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8738402128219604, + "num_tokens": 764687286.0, + "step": 20040 + }, + { + "epoch": 2.5494211932324133, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34703826904297, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8760012984275818, + "num_tokens": 764729235.0, + "step": 20041 + }, + { + "epoch": 2.549548403511004, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32769775390625, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.865561842918396, + "num_tokens": 764771834.0, + "step": 20042 + }, + { + "epoch": 2.549675613789594, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.346288681030273, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8726157546043396, + "num_tokens": 764808307.0, + "step": 20043 + }, + { + "epoch": 2.549802824068185, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.177717208862305, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8664392232894897, + "num_tokens": 764848373.0, + "step": 20044 + }, + { + "epoch": 2.549930034346775, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38138198852539, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8686317801475525, + "num_tokens": 764886468.0, + "step": 20045 + }, + { + "epoch": 2.550057244625366, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.134504318237305, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8745011687278748, + "num_tokens": 764928690.0, + "step": 20046 + }, + { + "epoch": 2.550184454903956, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.16459846496582, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8392423391342163, + "num_tokens": 764969148.0, + "step": 20047 + }, + { + "epoch": 2.5503116651825466, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40364646911621, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8663721680641174, + "num_tokens": 765010754.0, + "step": 20048 + }, + { + "epoch": 2.550438875461137, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.103763580322266, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8774169683456421, + "num_tokens": 765043562.0, + "step": 20049 + }, + { + "epoch": 2.5505660857397277, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.313915252685547, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8544238805770874, + "num_tokens": 765085638.0, + "step": 20050 + }, + { + "epoch": 2.550693296018318, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.296533584594727, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8799285888671875, + "num_tokens": 765115847.0, + "step": 20051 + }, + { + "epoch": 2.5508205062969087, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.958620071411133, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8715812563896179, + "num_tokens": 765146839.0, + "step": 20052 + }, + { + "epoch": 2.5509477165754992, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.273670196533203, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8744595646858215, + "num_tokens": 765185921.0, + "step": 20053 + }, + { + "epoch": 2.5510749268540898, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.187938690185547, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.873339056968689, + "num_tokens": 765224534.0, + "step": 20054 + }, + { + "epoch": 2.5512021371326803, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19230842590332, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.881798267364502, + "num_tokens": 765267160.0, + "step": 20055 + }, + { + "epoch": 2.551329347411271, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.241565704345703, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8743265867233276, + "num_tokens": 765304770.0, + "step": 20056 + }, + { + "epoch": 2.5514565576898613, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10662269592285, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.869874119758606, + "num_tokens": 765341966.0, + "step": 20057 + }, + { + "epoch": 2.551583767968452, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.050498962402344, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.887237548828125, + "num_tokens": 765382653.0, + "step": 20058 + }, + { + "epoch": 2.5517109782470424, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.166589736938477, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8723764419555664, + "num_tokens": 765414940.0, + "step": 20059 + }, + { + "epoch": 2.551838188525633, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.356380462646484, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8765199184417725, + "num_tokens": 765445549.0, + "step": 20060 + }, + { + "epoch": 2.5519653988042235, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.099266052246094, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8807346224784851, + "num_tokens": 765484474.0, + "step": 20061 + }, + { + "epoch": 2.552092609082814, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.446517944335938, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8731431365013123, + "num_tokens": 765522651.0, + "step": 20062 + }, + { + "epoch": 2.5522198193614045, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29921531677246, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8677862882614136, + "num_tokens": 765565983.0, + "step": 20063 + }, + { + "epoch": 2.552347029639995, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.278989791870117, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8738121390342712, + "num_tokens": 765604056.0, + "step": 20064 + }, + { + "epoch": 2.5524742399185856, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34723472595215, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8816146850585938, + "num_tokens": 765646062.0, + "step": 20065 + }, + { + "epoch": 2.5526014501971757, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.053001403808594, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8715674877166748, + "num_tokens": 765687284.0, + "step": 20066 + }, + { + "epoch": 2.5527286604757666, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35359001159668, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8601338863372803, + "num_tokens": 765723743.0, + "step": 20067 + }, + { + "epoch": 2.5528558707543567, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27432632446289, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8698253631591797, + "num_tokens": 765768472.0, + "step": 20068 + }, + { + "epoch": 2.5529830810329477, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.487030029296875, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8883945941925049, + "num_tokens": 765809040.0, + "step": 20069 + }, + { + "epoch": 2.5531102913115378, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3372802734375, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.873825192451477, + "num_tokens": 765845968.0, + "step": 20070 + }, + { + "epoch": 2.5532375015901287, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.059816360473633, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8782156109809875, + "num_tokens": 765881037.0, + "step": 20071 + }, + { + "epoch": 2.553364711868719, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.489097595214844, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8777126669883728, + "num_tokens": 765918945.0, + "step": 20072 + }, + { + "epoch": 2.5534919221473094, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.174169540405273, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8854132294654846, + "num_tokens": 765956128.0, + "step": 20073 + }, + { + "epoch": 2.5536191324259, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.038572311401367, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8860061168670654, + "num_tokens": 765997109.0, + "step": 20074 + }, + { + "epoch": 2.5537463427044904, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.179569244384766, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8710525631904602, + "num_tokens": 766032557.0, + "step": 20075 + }, + { + "epoch": 2.553873552983081, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.158985137939453, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.869070291519165, + "num_tokens": 766075285.0, + "step": 20076 + }, + { + "epoch": 2.5540007632616715, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.158050537109375, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8710446357727051, + "num_tokens": 766110600.0, + "step": 20077 + }, + { + "epoch": 2.554127973540262, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195772171020508, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8620107173919678, + "num_tokens": 766146877.0, + "step": 20078 + }, + { + "epoch": 2.5542551838188525, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.401493072509766, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8681299686431885, + "num_tokens": 766185164.0, + "step": 20079 + }, + { + "epoch": 2.554382394097443, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.141136169433594, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8927291631698608, + "num_tokens": 766221442.0, + "step": 20080 + }, + { + "epoch": 2.5545096043760336, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.472633361816406, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8496764302253723, + "num_tokens": 766255258.0, + "step": 20081 + }, + { + "epoch": 2.554636814654624, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.287845611572266, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.870307207107544, + "num_tokens": 766292392.0, + "step": 20082 + }, + { + "epoch": 2.5547640249332146, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.402433395385742, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8636974096298218, + "num_tokens": 766335312.0, + "step": 20083 + }, + { + "epoch": 2.554891235211805, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.295642852783203, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8749194145202637, + "num_tokens": 766372189.0, + "step": 20084 + }, + { + "epoch": 2.5550184454903957, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.171232223510742, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8624497652053833, + "num_tokens": 766414021.0, + "step": 20085 + }, + { + "epoch": 2.5551456557689862, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.393085479736328, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8862557411193848, + "num_tokens": 766452005.0, + "step": 20086 + }, + { + "epoch": 2.5552728660475768, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11493492126465, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8664460182189941, + "num_tokens": 766492150.0, + "step": 20087 + }, + { + "epoch": 2.5554000763261673, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.363370895385742, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8832682371139526, + "num_tokens": 766533889.0, + "step": 20088 + }, + { + "epoch": 2.555527286604758, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30866813659668, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8722213506698608, + "num_tokens": 766565008.0, + "step": 20089 + }, + { + "epoch": 2.5556544968833483, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.20071029663086, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8738885521888733, + "num_tokens": 766605744.0, + "step": 20090 + }, + { + "epoch": 2.5557817071619384, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.323884963989258, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8809924125671387, + "num_tokens": 766645782.0, + "step": 20091 + }, + { + "epoch": 2.5559089174405294, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.251522064208984, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8636350035667419, + "num_tokens": 766683135.0, + "step": 20092 + }, + { + "epoch": 2.5560361277191195, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18302345275879, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8733867406845093, + "num_tokens": 766720691.0, + "step": 20093 + }, + { + "epoch": 2.5561633379977104, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.127700805664062, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8682272434234619, + "num_tokens": 766755426.0, + "step": 20094 + }, + { + "epoch": 2.5562905482763005, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37958526611328, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.878608763217926, + "num_tokens": 766788315.0, + "step": 20095 + }, + { + "epoch": 2.5564177585548915, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.081235885620117, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8891122341156006, + "num_tokens": 766825664.0, + "step": 20096 + }, + { + "epoch": 2.5565449688334816, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.362369537353516, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8854550719261169, + "num_tokens": 766868145.0, + "step": 20097 + }, + { + "epoch": 2.556672179112072, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.415822982788086, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8619751334190369, + "num_tokens": 766907278.0, + "step": 20098 + }, + { + "epoch": 2.5567993893906626, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.275230407714844, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8861452341079712, + "num_tokens": 766941662.0, + "step": 20099 + }, + { + "epoch": 2.556926599669253, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.407258987426758, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8555896878242493, + "num_tokens": 766977725.0, + "step": 20100 + }, + { + "epoch": 2.5570538099478437, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.139074325561523, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8691930770874023, + "num_tokens": 767011011.0, + "step": 20101 + }, + { + "epoch": 2.5571810202264342, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.253890991210938, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8706862330436707, + "num_tokens": 767047078.0, + "step": 20102 + }, + { + "epoch": 2.5573082305050248, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.546344757080078, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8690750598907471, + "num_tokens": 767089208.0, + "step": 20103 + }, + { + "epoch": 2.5574354407836153, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33111000061035, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8582940697669983, + "num_tokens": 767131225.0, + "step": 20104 + }, + { + "epoch": 2.557562651062206, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.395891189575195, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8618372082710266, + "num_tokens": 767165222.0, + "step": 20105 + }, + { + "epoch": 2.5576898613407963, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47063446044922, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8735790252685547, + "num_tokens": 767200870.0, + "step": 20106 + }, + { + "epoch": 2.557817071619387, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33329963684082, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.871788740158081, + "num_tokens": 767236881.0, + "step": 20107 + }, + { + "epoch": 2.5579442818979774, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.296653747558594, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8787153363227844, + "num_tokens": 767269716.0, + "step": 20108 + }, + { + "epoch": 2.558071492176568, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45347785949707, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8638732433319092, + "num_tokens": 767306840.0, + "step": 20109 + }, + { + "epoch": 2.5581987024551585, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.562116622924805, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8853834867477417, + "num_tokens": 767346966.0, + "step": 20110 + }, + { + "epoch": 2.558325912733749, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.952239990234375, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8687549829483032, + "num_tokens": 767387084.0, + "step": 20111 + }, + { + "epoch": 2.5584531230123395, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.575572967529297, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8543266654014587, + "num_tokens": 767417857.0, + "step": 20112 + }, + { + "epoch": 2.55858033329093, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.152984619140625, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8749629855155945, + "num_tokens": 767455771.0, + "step": 20113 + }, + { + "epoch": 2.5587075435695206, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.321443557739258, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8598494529724121, + "num_tokens": 767493187.0, + "step": 20114 + }, + { + "epoch": 2.558834753848111, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40968132019043, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8654248714447021, + "num_tokens": 767534526.0, + "step": 20115 + }, + { + "epoch": 2.558961964126701, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.576404571533203, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8702645897865295, + "num_tokens": 767569297.0, + "step": 20116 + }, + { + "epoch": 2.559089174405292, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14627456665039, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8699769973754883, + "num_tokens": 767605860.0, + "step": 20117 + }, + { + "epoch": 2.5592163846838822, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31569480895996, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8746655583381653, + "num_tokens": 767645252.0, + "step": 20118 + }, + { + "epoch": 2.559343594962473, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.692731857299805, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8733278512954712, + "num_tokens": 767681942.0, + "step": 20119 + }, + { + "epoch": 2.5594708052410633, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1176815032959, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8721444010734558, + "num_tokens": 767720631.0, + "step": 20120 + }, + { + "epoch": 2.559598015519654, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.178014755249023, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8761420249938965, + "num_tokens": 767757018.0, + "step": 20121 + }, + { + "epoch": 2.5597252257982444, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.591184616088867, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8763880729675293, + "num_tokens": 767796046.0, + "step": 20122 + }, + { + "epoch": 2.559852436076835, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.288536071777344, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8836183547973633, + "num_tokens": 767835678.0, + "step": 20123 + }, + { + "epoch": 2.5599796463554254, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.125591278076172, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8873677253723145, + "num_tokens": 767870515.0, + "step": 20124 + }, + { + "epoch": 2.560106856634016, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59222984313965, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8577627539634705, + "num_tokens": 767911573.0, + "step": 20125 + }, + { + "epoch": 2.5602340669126065, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11039161682129, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8716366291046143, + "num_tokens": 767950161.0, + "step": 20126 + }, + { + "epoch": 2.560361277191197, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45516014099121, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8705657124519348, + "num_tokens": 767989431.0, + "step": 20127 + }, + { + "epoch": 2.5604884874697875, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.577777862548828, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8831772208213806, + "num_tokens": 768023551.0, + "step": 20128 + }, + { + "epoch": 2.560615697748378, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34227180480957, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8815689086914062, + "num_tokens": 768055317.0, + "step": 20129 + }, + { + "epoch": 2.5607429080269686, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.431941986083984, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8663061857223511, + "num_tokens": 768089575.0, + "step": 20130 + }, + { + "epoch": 2.560870118305559, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.283157348632812, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8639073371887207, + "num_tokens": 768131795.0, + "step": 20131 + }, + { + "epoch": 2.5609973285841496, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.293010711669922, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8762938380241394, + "num_tokens": 768165714.0, + "step": 20132 + }, + { + "epoch": 2.56112453886274, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.366199493408203, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8805078268051147, + "num_tokens": 768203475.0, + "step": 20133 + }, + { + "epoch": 2.5612517491413307, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25447654724121, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8884305953979492, + "num_tokens": 768244841.0, + "step": 20134 + }, + { + "epoch": 2.561378959419921, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.080059051513672, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.867632269859314, + "num_tokens": 768286502.0, + "step": 20135 + }, + { + "epoch": 2.5615061696985117, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195262908935547, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8809285759925842, + "num_tokens": 768320628.0, + "step": 20136 + }, + { + "epoch": 2.5616333799771023, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54314422607422, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8661980628967285, + "num_tokens": 768353630.0, + "step": 20137 + }, + { + "epoch": 2.561760590255693, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.044326782226562, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8731008172035217, + "num_tokens": 768388170.0, + "step": 20138 + }, + { + "epoch": 2.5618878005342833, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.160472869873047, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8897198438644409, + "num_tokens": 768428238.0, + "step": 20139 + }, + { + "epoch": 2.562015010812874, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.346149444580078, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8869636058807373, + "num_tokens": 768460075.0, + "step": 20140 + }, + { + "epoch": 2.562142221091464, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.045413970947266, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8912020921707153, + "num_tokens": 768493982.0, + "step": 20141 + }, + { + "epoch": 2.562269431370055, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54814338684082, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8771075010299683, + "num_tokens": 768529121.0, + "step": 20142 + }, + { + "epoch": 2.562396641648645, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12274169921875, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8737114071846008, + "num_tokens": 768560730.0, + "step": 20143 + }, + { + "epoch": 2.562523851927236, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.123065948486328, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8835690021514893, + "num_tokens": 768603015.0, + "step": 20144 + }, + { + "epoch": 2.562651062205826, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31134605407715, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8730766773223877, + "num_tokens": 768639847.0, + "step": 20145 + }, + { + "epoch": 2.5627782724844166, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.163372039794922, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8696773648262024, + "num_tokens": 768681614.0, + "step": 20146 + }, + { + "epoch": 2.562905482763007, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.493837356567383, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8768105506896973, + "num_tokens": 768722052.0, + "step": 20147 + }, + { + "epoch": 2.5630326930415976, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.311168670654297, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8669928908348083, + "num_tokens": 768754191.0, + "step": 20148 + }, + { + "epoch": 2.563159903320188, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13463020324707, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8630689978599548, + "num_tokens": 768792981.0, + "step": 20149 + }, + { + "epoch": 2.5632871135987787, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43362045288086, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8644160032272339, + "num_tokens": 768827040.0, + "step": 20150 + }, + { + "epoch": 2.5634143238773692, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29883575439453, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.871672511100769, + "num_tokens": 768865669.0, + "step": 20151 + }, + { + "epoch": 2.5635415341559598, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.068613052368164, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8716540336608887, + "num_tokens": 768903306.0, + "step": 20152 + }, + { + "epoch": 2.5636687444345503, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.625043869018555, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8767589926719666, + "num_tokens": 768946322.0, + "step": 20153 + }, + { + "epoch": 2.563795954713141, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.356143951416016, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8907352685928345, + "num_tokens": 768987350.0, + "step": 20154 + }, + { + "epoch": 2.5639231649917313, + "ewc_loss": 0.038330078125, + "ewc_loss_parallel": 3.838539123535156e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21632957458496, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8813024759292603, + "num_tokens": 769030762.0, + "step": 20155 + }, + { + "epoch": 2.564050375270322, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.335250854492188, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8846399784088135, + "num_tokens": 769069827.0, + "step": 20156 + }, + { + "epoch": 2.5641775855489124, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37736701965332, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8737750053405762, + "num_tokens": 769107742.0, + "step": 20157 + }, + { + "epoch": 2.564304795827503, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17576026916504, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8719070553779602, + "num_tokens": 769153286.0, + "step": 20158 + }, + { + "epoch": 2.5644320061060935, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.274974822998047, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.874924898147583, + "num_tokens": 769192646.0, + "step": 20159 + }, + { + "epoch": 2.564559216384684, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4171142578125, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8516703248023987, + "num_tokens": 769230360.0, + "step": 20160 + }, + { + "epoch": 2.5646864266632745, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.397470474243164, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8891751766204834, + "num_tokens": 769270754.0, + "step": 20161 + }, + { + "epoch": 2.564813636941865, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.276403427124023, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8502165079116821, + "num_tokens": 769310109.0, + "step": 20162 + }, + { + "epoch": 2.5649408472204556, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.908750534057617, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8720397353172302, + "num_tokens": 769352101.0, + "step": 20163 + }, + { + "epoch": 2.5650680574990457, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.498485565185547, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.887135922908783, + "num_tokens": 769386570.0, + "step": 20164 + }, + { + "epoch": 2.5651952677776366, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.479610443115234, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8849396705627441, + "num_tokens": 769427167.0, + "step": 20165 + }, + { + "epoch": 2.5653224780562267, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.184614181518555, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8630115985870361, + "num_tokens": 769465456.0, + "step": 20166 + }, + { + "epoch": 2.5654496883348177, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07331085205078, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8848220109939575, + "num_tokens": 769497135.0, + "step": 20167 + }, + { + "epoch": 2.5655768986134078, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.398874282836914, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8673155903816223, + "num_tokens": 769533706.0, + "step": 20168 + }, + { + "epoch": 2.5657041088919987, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.303726196289062, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8673476576805115, + "num_tokens": 769578698.0, + "step": 20169 + }, + { + "epoch": 2.565831319170589, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.076013565063477, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8851979970932007, + "num_tokens": 769619799.0, + "step": 20170 + }, + { + "epoch": 2.5659585294491793, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28610610961914, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8731045722961426, + "num_tokens": 769655973.0, + "step": 20171 + }, + { + "epoch": 2.56608573972777, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.205167770385742, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8692935705184937, + "num_tokens": 769692333.0, + "step": 20172 + }, + { + "epoch": 2.5662129500063604, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195371627807617, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8785806894302368, + "num_tokens": 769731804.0, + "step": 20173 + }, + { + "epoch": 2.566340160284951, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.249595642089844, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8794966340065002, + "num_tokens": 769767409.0, + "step": 20174 + }, + { + "epoch": 2.5664673705635415, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.089710235595703, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8712978959083557, + "num_tokens": 769806468.0, + "step": 20175 + }, + { + "epoch": 2.566594580842132, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.173330307006836, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8814368844032288, + "num_tokens": 769842025.0, + "step": 20176 + }, + { + "epoch": 2.5667217911207225, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.241811752319336, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8692356944084167, + "num_tokens": 769884125.0, + "step": 20177 + }, + { + "epoch": 2.566849001399313, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.254047393798828, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8618254661560059, + "num_tokens": 769920523.0, + "step": 20178 + }, + { + "epoch": 2.5669762116779036, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.133039474487305, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8717340230941772, + "num_tokens": 769960141.0, + "step": 20179 + }, + { + "epoch": 2.567103421956494, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.20981216430664, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8744757175445557, + "num_tokens": 769995281.0, + "step": 20180 + }, + { + "epoch": 2.5672306322350846, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.336219787597656, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8762553930282593, + "num_tokens": 770028230.0, + "step": 20181 + }, + { + "epoch": 2.567357842513675, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.168384552001953, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8803659677505493, + "num_tokens": 770068996.0, + "step": 20182 + }, + { + "epoch": 2.5674850527922657, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.310558319091797, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8625307083129883, + "num_tokens": 770107018.0, + "step": 20183 + }, + { + "epoch": 2.567612263070856, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.206024169921875, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.872247576713562, + "num_tokens": 770140251.0, + "step": 20184 + }, + { + "epoch": 2.5677394733494467, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.07495880126953, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8657439947128296, + "num_tokens": 770182241.0, + "step": 20185 + }, + { + "epoch": 2.5678666836280373, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31001853942871, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8868979215621948, + "num_tokens": 770223267.0, + "step": 20186 + }, + { + "epoch": 2.567993893906628, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.03565216064453, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8632053732872009, + "num_tokens": 770261856.0, + "step": 20187 + }, + { + "epoch": 2.5681211041852183, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32538604736328, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8627753257751465, + "num_tokens": 770299762.0, + "step": 20188 + }, + { + "epoch": 2.5682483144638084, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.253034591674805, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8675814270973206, + "num_tokens": 770333690.0, + "step": 20189 + }, + { + "epoch": 2.5683755247423994, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.098493576049805, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.879387617111206, + "num_tokens": 770375755.0, + "step": 20190 + }, + { + "epoch": 2.5685027350209895, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.197492599487305, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.881363570690155, + "num_tokens": 770417805.0, + "step": 20191 + }, + { + "epoch": 2.5686299452995804, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.048250198364258, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8719230890274048, + "num_tokens": 770458183.0, + "step": 20192 + }, + { + "epoch": 2.5687571555781705, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.203712463378906, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8762712478637695, + "num_tokens": 770488973.0, + "step": 20193 + }, + { + "epoch": 2.5688843658567615, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195598602294922, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8678815364837646, + "num_tokens": 770529030.0, + "step": 20194 + }, + { + "epoch": 2.5690115761353516, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.172470092773438, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8624655604362488, + "num_tokens": 770574697.0, + "step": 20195 + }, + { + "epoch": 2.569138786413942, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.251317977905273, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8787825107574463, + "num_tokens": 770615417.0, + "step": 20196 + }, + { + "epoch": 2.5692659966925326, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.309816360473633, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8598619699478149, + "num_tokens": 770653228.0, + "step": 20197 + }, + { + "epoch": 2.569393206971123, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.144733428955078, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8876845240592957, + "num_tokens": 770693876.0, + "step": 20198 + }, + { + "epoch": 2.5695204172497137, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08854103088379, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8717828989028931, + "num_tokens": 770728754.0, + "step": 20199 + }, + { + "epoch": 2.5696476275283042, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.255504608154297, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8714255094528198, + "num_tokens": 770767961.0, + "step": 20200 + }, + { + "epoch": 2.5697748378068948, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22150230407715, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8467066287994385, + "num_tokens": 770808880.0, + "step": 20201 + }, + { + "epoch": 2.5699020480854853, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.02931022644043, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.867501974105835, + "num_tokens": 770846483.0, + "step": 20202 + }, + { + "epoch": 2.570029258364076, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.091304779052734, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8534358739852905, + "num_tokens": 770884234.0, + "step": 20203 + }, + { + "epoch": 2.5701564686426663, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.330135345458984, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8594350814819336, + "num_tokens": 770918785.0, + "step": 20204 + }, + { + "epoch": 2.570283678921257, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.06450843811035, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8938570022583008, + "num_tokens": 770956889.0, + "step": 20205 + }, + { + "epoch": 2.5704108891998474, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.186321258544922, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8595924377441406, + "num_tokens": 770994937.0, + "step": 20206 + }, + { + "epoch": 2.570538099478438, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.0484619140625, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8811456561088562, + "num_tokens": 771031451.0, + "step": 20207 + }, + { + "epoch": 2.5706653097570284, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15593910217285, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8804850578308105, + "num_tokens": 771061575.0, + "step": 20208 + }, + { + "epoch": 2.570792520035619, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.454288482666016, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.872668445110321, + "num_tokens": 771103366.0, + "step": 20209 + }, + { + "epoch": 2.5709197303142095, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12920570373535, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8775995373725891, + "num_tokens": 771142178.0, + "step": 20210 + }, + { + "epoch": 2.5710469405928, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.206398010253906, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8733664155006409, + "num_tokens": 771179296.0, + "step": 20211 + }, + { + "epoch": 2.5711741508713906, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.164901733398438, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8665462732315063, + "num_tokens": 771219084.0, + "step": 20212 + }, + { + "epoch": 2.571301361149981, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.300823211669922, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8796770572662354, + "num_tokens": 771250609.0, + "step": 20213 + }, + { + "epoch": 2.571428571428571, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.279476165771484, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8709874749183655, + "num_tokens": 771286191.0, + "step": 20214 + }, + { + "epoch": 2.571555781707162, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.256227493286133, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.872388482093811, + "num_tokens": 771320898.0, + "step": 20215 + }, + { + "epoch": 2.5716829919857522, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.202512741088867, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8647624254226685, + "num_tokens": 771353977.0, + "step": 20216 + }, + { + "epoch": 2.571810202264343, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.166542053222656, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8780534267425537, + "num_tokens": 771391960.0, + "step": 20217 + }, + { + "epoch": 2.5719374125429333, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.196449279785156, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8651699423789978, + "num_tokens": 771427764.0, + "step": 20218 + }, + { + "epoch": 2.572064622821524, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.120328903198242, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.873268723487854, + "num_tokens": 771469706.0, + "step": 20219 + }, + { + "epoch": 2.5721918331001143, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33969497680664, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8933420181274414, + "num_tokens": 771504296.0, + "step": 20220 + }, + { + "epoch": 2.572319043378705, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.005319595336914, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8727575540542603, + "num_tokens": 771544496.0, + "step": 20221 + }, + { + "epoch": 2.5724462536572954, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.108047485351562, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8814985752105713, + "num_tokens": 771584322.0, + "step": 20222 + }, + { + "epoch": 2.572573463935886, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.292139053344727, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8819603323936462, + "num_tokens": 771621424.0, + "step": 20223 + }, + { + "epoch": 2.5727006742144765, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.384807586669922, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8747503161430359, + "num_tokens": 771658173.0, + "step": 20224 + }, + { + "epoch": 2.572827884493067, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.299169540405273, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8654928207397461, + "num_tokens": 771698182.0, + "step": 20225 + }, + { + "epoch": 2.5729550947716575, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.142335891723633, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8582056760787964, + "num_tokens": 771731559.0, + "step": 20226 + }, + { + "epoch": 2.573082305050248, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.277538299560547, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8688167333602905, + "num_tokens": 771774362.0, + "step": 20227 + }, + { + "epoch": 2.5732095153288386, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.268075942993164, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8638759851455688, + "num_tokens": 771810540.0, + "step": 20228 + }, + { + "epoch": 2.573336725607429, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.149555206298828, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.876936137676239, + "num_tokens": 771846221.0, + "step": 20229 + }, + { + "epoch": 2.5734639358860196, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.466562271118164, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8827056288719177, + "num_tokens": 771882268.0, + "step": 20230 + }, + { + "epoch": 2.57359114616461, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09641456604004, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8726507425308228, + "num_tokens": 771913176.0, + "step": 20231 + }, + { + "epoch": 2.5737183564432007, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.360305786132812, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8684316873550415, + "num_tokens": 771948163.0, + "step": 20232 + }, + { + "epoch": 2.573845566721791, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12640953063965, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8858908414840698, + "num_tokens": 771988296.0, + "step": 20233 + }, + { + "epoch": 2.5739727770003817, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.207149505615234, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8807616233825684, + "num_tokens": 772024330.0, + "step": 20234 + }, + { + "epoch": 2.5740999872789723, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.137523651123047, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8616346120834351, + "num_tokens": 772062291.0, + "step": 20235 + }, + { + "epoch": 2.574227197557563, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.367801666259766, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8750894069671631, + "num_tokens": 772110239.0, + "step": 20236 + }, + { + "epoch": 2.5743544078361533, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.20509910583496, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8688421249389648, + "num_tokens": 772153201.0, + "step": 20237 + }, + { + "epoch": 2.574481618114744, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31262969970703, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.875318169593811, + "num_tokens": 772186816.0, + "step": 20238 + }, + { + "epoch": 2.574608828393334, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.364681243896484, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8820862770080566, + "num_tokens": 772224238.0, + "step": 20239 + }, + { + "epoch": 2.574736038671925, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.296579360961914, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8814687728881836, + "num_tokens": 772263505.0, + "step": 20240 + }, + { + "epoch": 2.574863248950515, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.316913604736328, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8816899061203003, + "num_tokens": 772303205.0, + "step": 20241 + }, + { + "epoch": 2.574990459229106, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23124885559082, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8778425455093384, + "num_tokens": 772344495.0, + "step": 20242 + }, + { + "epoch": 2.575117669507696, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22454261779785, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8684340119361877, + "num_tokens": 772380296.0, + "step": 20243 + }, + { + "epoch": 2.5752448797862866, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35222053527832, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8862662315368652, + "num_tokens": 772423933.0, + "step": 20244 + }, + { + "epoch": 2.575372090064877, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.267606735229492, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8593187928199768, + "num_tokens": 772464514.0, + "step": 20245 + }, + { + "epoch": 2.5754993003434676, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.357683181762695, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8922505974769592, + "num_tokens": 772498501.0, + "step": 20246 + }, + { + "epoch": 2.575626510622058, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.319164276123047, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8694236278533936, + "num_tokens": 772538072.0, + "step": 20247 + }, + { + "epoch": 2.5757537209006487, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.260488510131836, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8738419413566589, + "num_tokens": 772577434.0, + "step": 20248 + }, + { + "epoch": 2.575880931179239, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.087617874145508, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8873888254165649, + "num_tokens": 772612917.0, + "step": 20249 + }, + { + "epoch": 2.5760081414578297, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26700782775879, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8853225708007812, + "num_tokens": 772652939.0, + "step": 20250 + }, + { + "epoch": 2.5761353517364203, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.180782318115234, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8804789781570435, + "num_tokens": 772688018.0, + "step": 20251 + }, + { + "epoch": 2.576262562015011, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.400970458984375, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8638970255851746, + "num_tokens": 772728071.0, + "step": 20252 + }, + { + "epoch": 2.5763897722936013, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27796745300293, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8760685920715332, + "num_tokens": 772762730.0, + "step": 20253 + }, + { + "epoch": 2.576516982572192, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31114387512207, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8753776550292969, + "num_tokens": 772799125.0, + "step": 20254 + }, + { + "epoch": 2.5766441928507824, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.390056610107422, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8854106068611145, + "num_tokens": 772838657.0, + "step": 20255 + }, + { + "epoch": 2.576771403129373, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.470420837402344, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8809285759925842, + "num_tokens": 772876470.0, + "step": 20256 + }, + { + "epoch": 2.5768986134079634, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.497825622558594, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8674901127815247, + "num_tokens": 772910369.0, + "step": 20257 + }, + { + "epoch": 2.577025823686554, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.186691284179688, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8793357610702515, + "num_tokens": 772948129.0, + "step": 20258 + }, + { + "epoch": 2.5771530339651445, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55257225036621, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8994299173355103, + "num_tokens": 772985359.0, + "step": 20259 + }, + { + "epoch": 2.577280244243735, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.392419815063477, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8756640553474426, + "num_tokens": 773019402.0, + "step": 20260 + }, + { + "epoch": 2.5774074545223256, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.221338272094727, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8732550144195557, + "num_tokens": 773059176.0, + "step": 20261 + }, + { + "epoch": 2.5775346648009156, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34933853149414, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8752856254577637, + "num_tokens": 773094274.0, + "step": 20262 + }, + { + "epoch": 2.5776618750795066, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33456039428711, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8726288676261902, + "num_tokens": 773135679.0, + "step": 20263 + }, + { + "epoch": 2.5777890853580967, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.246824264526367, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8700323104858398, + "num_tokens": 773174270.0, + "step": 20264 + }, + { + "epoch": 2.5779162956366877, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.116743087768555, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8750370740890503, + "num_tokens": 773213582.0, + "step": 20265 + }, + { + "epoch": 2.5780435059152778, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.313331604003906, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.869322657585144, + "num_tokens": 773244980.0, + "step": 20266 + }, + { + "epoch": 2.5781707161938687, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.306447982788086, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8878909945487976, + "num_tokens": 773287395.0, + "step": 20267 + }, + { + "epoch": 2.578297926472459, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.269468307495117, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8701157569885254, + "num_tokens": 773324593.0, + "step": 20268 + }, + { + "epoch": 2.5784251367510493, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.266817092895508, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.868735671043396, + "num_tokens": 773361802.0, + "step": 20269 + }, + { + "epoch": 2.57855234702964, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39873504638672, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8784641027450562, + "num_tokens": 773401348.0, + "step": 20270 + }, + { + "epoch": 2.5786795573082304, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.353492736816406, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8904442191123962, + "num_tokens": 773439040.0, + "step": 20271 + }, + { + "epoch": 2.578806767586821, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10338592529297, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.886400580406189, + "num_tokens": 773471382.0, + "step": 20272 + }, + { + "epoch": 2.5789339778654115, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34016990661621, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8636178374290466, + "num_tokens": 773508924.0, + "step": 20273 + }, + { + "epoch": 2.579061188144002, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.353681564331055, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8880259990692139, + "num_tokens": 773550219.0, + "step": 20274 + }, + { + "epoch": 2.5791883984225925, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.272327423095703, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8696883916854858, + "num_tokens": 773589363.0, + "step": 20275 + }, + { + "epoch": 2.579315608701183, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.363622665405273, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8732172846794128, + "num_tokens": 773630289.0, + "step": 20276 + }, + { + "epoch": 2.5794428189797736, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.288022994995117, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8758864402770996, + "num_tokens": 773668660.0, + "step": 20277 + }, + { + "epoch": 2.579570029258364, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35382843017578, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8704620599746704, + "num_tokens": 773704318.0, + "step": 20278 + }, + { + "epoch": 2.5796972395369546, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33999252319336, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8591822385787964, + "num_tokens": 773745821.0, + "step": 20279 + }, + { + "epoch": 2.579824449815545, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.533740997314453, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8835987448692322, + "num_tokens": 773782880.0, + "step": 20280 + }, + { + "epoch": 2.5799516600941357, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.148712158203125, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8606032133102417, + "num_tokens": 773824880.0, + "step": 20281 + }, + { + "epoch": 2.580078870372726, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.455371856689453, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.87706458568573, + "num_tokens": 773863210.0, + "step": 20282 + }, + { + "epoch": 2.5802060806513167, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.456016540527344, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8816913962364197, + "num_tokens": 773903432.0, + "step": 20283 + }, + { + "epoch": 2.5803332909299073, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17323875427246, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8819332718849182, + "num_tokens": 773942424.0, + "step": 20284 + }, + { + "epoch": 2.580460501208498, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.470834732055664, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8732167482376099, + "num_tokens": 773976598.0, + "step": 20285 + }, + { + "epoch": 2.5805877114870883, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38877296447754, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8681035041809082, + "num_tokens": 774016696.0, + "step": 20286 + }, + { + "epoch": 2.5807149217656784, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.979524612426758, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8678716421127319, + "num_tokens": 774059046.0, + "step": 20287 + }, + { + "epoch": 2.5808421320442694, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.530071258544922, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.869934618473053, + "num_tokens": 774095322.0, + "step": 20288 + }, + { + "epoch": 2.5809693423228595, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.454206466674805, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8625482320785522, + "num_tokens": 774134506.0, + "step": 20289 + }, + { + "epoch": 2.5810965526014504, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22705078125, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8885515332221985, + "num_tokens": 774176997.0, + "step": 20290 + }, + { + "epoch": 2.5812237628800405, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37190055847168, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8695924878120422, + "num_tokens": 774213900.0, + "step": 20291 + }, + { + "epoch": 2.5813509731586315, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34946060180664, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8657849431037903, + "num_tokens": 774259687.0, + "step": 20292 + }, + { + "epoch": 2.5814781834372216, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.484128952026367, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8784403800964355, + "num_tokens": 774297200.0, + "step": 20293 + }, + { + "epoch": 2.581605393715812, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.443132400512695, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8861761093139648, + "num_tokens": 774336642.0, + "step": 20294 + }, + { + "epoch": 2.5817326039944026, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1737117767334, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8589447736740112, + "num_tokens": 774376762.0, + "step": 20295 + }, + { + "epoch": 2.581859814272993, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41326332092285, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8820617198944092, + "num_tokens": 774421360.0, + "step": 20296 + }, + { + "epoch": 2.5819870245515837, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.377727508544922, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8815733194351196, + "num_tokens": 774460850.0, + "step": 20297 + }, + { + "epoch": 2.582114234830174, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.289440155029297, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8710444569587708, + "num_tokens": 774497542.0, + "step": 20298 + }, + { + "epoch": 2.5822414451087647, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4208984375, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8657824993133545, + "num_tokens": 774538164.0, + "step": 20299 + }, + { + "epoch": 2.5823686553873553, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.422040939331055, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8685891628265381, + "num_tokens": 774576118.0, + "step": 20300 + }, + { + "epoch": 2.582495865665946, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.104230880737305, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8945537805557251, + "num_tokens": 774617506.0, + "step": 20301 + }, + { + "epoch": 2.5826230759445363, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3286075592041, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8500086069107056, + "num_tokens": 774653720.0, + "step": 20302 + }, + { + "epoch": 2.582750286223127, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11404800415039, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8938319087028503, + "num_tokens": 774691056.0, + "step": 20303 + }, + { + "epoch": 2.5828774965017174, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51082992553711, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.868331789970398, + "num_tokens": 774726514.0, + "step": 20304 + }, + { + "epoch": 2.583004706780308, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.105648040771484, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8653193116188049, + "num_tokens": 774760831.0, + "step": 20305 + }, + { + "epoch": 2.5831319170588984, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47732162475586, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8832608461380005, + "num_tokens": 774799399.0, + "step": 20306 + }, + { + "epoch": 2.583259127337489, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22686195373535, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8658955097198486, + "num_tokens": 774839910.0, + "step": 20307 + }, + { + "epoch": 2.5833863376160795, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.480016708374023, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8745222687721252, + "num_tokens": 774875694.0, + "step": 20308 + }, + { + "epoch": 2.58351354789467, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38951873779297, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8755216598510742, + "num_tokens": 774908020.0, + "step": 20309 + }, + { + "epoch": 2.5836407581732606, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26947784423828, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8631213903427124, + "num_tokens": 774946716.0, + "step": 20310 + }, + { + "epoch": 2.583767968451851, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.347286224365234, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8684757947921753, + "num_tokens": 774981732.0, + "step": 20311 + }, + { + "epoch": 2.583895178730441, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.035144805908203, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8794769048690796, + "num_tokens": 775015280.0, + "step": 20312 + }, + { + "epoch": 2.584022389009032, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.182373046875, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8815127015113831, + "num_tokens": 775054518.0, + "step": 20313 + }, + { + "epoch": 2.5841495992876222, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36610221862793, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8798717856407166, + "num_tokens": 775091548.0, + "step": 20314 + }, + { + "epoch": 2.584276809566213, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25569725036621, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8619427680969238, + "num_tokens": 775130916.0, + "step": 20315 + }, + { + "epoch": 2.5844040198448033, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.389118194580078, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8864176869392395, + "num_tokens": 775165875.0, + "step": 20316 + }, + { + "epoch": 2.584531230123394, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30317497253418, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8605626821517944, + "num_tokens": 775211463.0, + "step": 20317 + }, + { + "epoch": 2.5846584404019843, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.299352645874023, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8824683427810669, + "num_tokens": 775248338.0, + "step": 20318 + }, + { + "epoch": 2.584785650680575, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2056827545166, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.86232590675354, + "num_tokens": 775285707.0, + "step": 20319 + }, + { + "epoch": 2.5849128609591654, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.365083694458008, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8823561668395996, + "num_tokens": 775322682.0, + "step": 20320 + }, + { + "epoch": 2.585040071237756, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54168701171875, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8846203684806824, + "num_tokens": 775358274.0, + "step": 20321 + }, + { + "epoch": 2.5851672815163464, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.182600021362305, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8697023391723633, + "num_tokens": 775394472.0, + "step": 20322 + }, + { + "epoch": 2.585294491794937, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24282455444336, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8839937448501587, + "num_tokens": 775433647.0, + "step": 20323 + }, + { + "epoch": 2.5854217020735275, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.479087829589844, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8643359541893005, + "num_tokens": 775469159.0, + "step": 20324 + }, + { + "epoch": 2.585548912352118, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.230449676513672, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8826988935470581, + "num_tokens": 775507329.0, + "step": 20325 + }, + { + "epoch": 2.5856761226307086, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50509262084961, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8722476959228516, + "num_tokens": 775544638.0, + "step": 20326 + }, + { + "epoch": 2.585803332909299, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30034065246582, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8764966130256653, + "num_tokens": 775584918.0, + "step": 20327 + }, + { + "epoch": 2.5859305431878896, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.313093185424805, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.869282603263855, + "num_tokens": 775624214.0, + "step": 20328 + }, + { + "epoch": 2.58605775346648, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.334978103637695, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8621670007705688, + "num_tokens": 775668372.0, + "step": 20329 + }, + { + "epoch": 2.5861849637450707, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.314586639404297, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8458903431892395, + "num_tokens": 775700578.0, + "step": 20330 + }, + { + "epoch": 2.586312174023661, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.176368713378906, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8803049921989441, + "num_tokens": 775746980.0, + "step": 20331 + }, + { + "epoch": 2.5864393843022517, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.229738235473633, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8683522343635559, + "num_tokens": 775780762.0, + "step": 20332 + }, + { + "epoch": 2.5865665945808423, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.448701858520508, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8794160485267639, + "num_tokens": 775815591.0, + "step": 20333 + }, + { + "epoch": 2.586693804859433, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34555435180664, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8653826713562012, + "num_tokens": 775856414.0, + "step": 20334 + }, + { + "epoch": 2.5868210151380233, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23737335205078, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8742634057998657, + "num_tokens": 775896939.0, + "step": 20335 + }, + { + "epoch": 2.586948225416614, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36555290222168, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8688557147979736, + "num_tokens": 775941778.0, + "step": 20336 + }, + { + "epoch": 2.587075435695204, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.248395919799805, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8728963136672974, + "num_tokens": 775979346.0, + "step": 20337 + }, + { + "epoch": 2.587202645973795, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.407840728759766, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8714760541915894, + "num_tokens": 776013793.0, + "step": 20338 + }, + { + "epoch": 2.587329856252385, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.331552505493164, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8869999647140503, + "num_tokens": 776058513.0, + "step": 20339 + }, + { + "epoch": 2.587457066530976, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23288917541504, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.873884916305542, + "num_tokens": 776102790.0, + "step": 20340 + }, + { + "epoch": 2.587584276809566, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.366878509521484, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8888256549835205, + "num_tokens": 776133898.0, + "step": 20341 + }, + { + "epoch": 2.5877114870881566, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27075958251953, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.872218132019043, + "num_tokens": 776167590.0, + "step": 20342 + }, + { + "epoch": 2.587838697366747, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195112228393555, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8675513863563538, + "num_tokens": 776207966.0, + "step": 20343 + }, + { + "epoch": 2.5879659076453376, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.356096267700195, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8745485544204712, + "num_tokens": 776245601.0, + "step": 20344 + }, + { + "epoch": 2.588093117923928, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.212051391601562, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.881695032119751, + "num_tokens": 776285377.0, + "step": 20345 + }, + { + "epoch": 2.5882203282025187, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.361804962158203, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8645175099372864, + "num_tokens": 776328116.0, + "step": 20346 + }, + { + "epoch": 2.588347538481109, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.367887496948242, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8583931922912598, + "num_tokens": 776369008.0, + "step": 20347 + }, + { + "epoch": 2.5884747487596997, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.153339385986328, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8643933534622192, + "num_tokens": 776410151.0, + "step": 20348 + }, + { + "epoch": 2.5886019590382903, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.286725997924805, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8977901935577393, + "num_tokens": 776445944.0, + "step": 20349 + }, + { + "epoch": 2.588729169316881, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.101703643798828, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8790880441665649, + "num_tokens": 776485388.0, + "step": 20350 + }, + { + "epoch": 2.5888563795954713, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503952026367188, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8558559417724609, + "num_tokens": 776529135.0, + "step": 20351 + }, + { + "epoch": 2.588983589874062, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.291627883911133, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8673518896102905, + "num_tokens": 776564310.0, + "step": 20352 + }, + { + "epoch": 2.5891108001526524, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.240970611572266, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8859014511108398, + "num_tokens": 776609004.0, + "step": 20353 + }, + { + "epoch": 2.589238010431243, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.240406036376953, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8801631927490234, + "num_tokens": 776645796.0, + "step": 20354 + }, + { + "epoch": 2.5893652207098334, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.275354385375977, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8650599718093872, + "num_tokens": 776684248.0, + "step": 20355 + }, + { + "epoch": 2.589492430988424, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27592658996582, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.876383900642395, + "num_tokens": 776722684.0, + "step": 20356 + }, + { + "epoch": 2.5896196412670145, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.289749145507812, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8779816627502441, + "num_tokens": 776764635.0, + "step": 20357 + }, + { + "epoch": 2.589746851545605, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55333709716797, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8626450896263123, + "num_tokens": 776805683.0, + "step": 20358 + }, + { + "epoch": 2.5898740618241956, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29962730407715, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8760629296302795, + "num_tokens": 776841229.0, + "step": 20359 + }, + { + "epoch": 2.5900012721027856, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.409168243408203, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8815293312072754, + "num_tokens": 776881319.0, + "step": 20360 + }, + { + "epoch": 2.5901284823813766, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25436782836914, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8786319494247437, + "num_tokens": 776919436.0, + "step": 20361 + }, + { + "epoch": 2.5902556926599667, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.167192459106445, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.856326162815094, + "num_tokens": 776958644.0, + "step": 20362 + }, + { + "epoch": 2.5903829029385577, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42974090576172, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8919267654418945, + "num_tokens": 776994509.0, + "step": 20363 + }, + { + "epoch": 2.5905101132171477, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.173025131225586, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8800196647644043, + "num_tokens": 777037557.0, + "step": 20364 + }, + { + "epoch": 2.5906373234957387, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.345348358154297, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8548447489738464, + "num_tokens": 777077237.0, + "step": 20365 + }, + { + "epoch": 2.590764533774329, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.302696228027344, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.880635142326355, + "num_tokens": 777120813.0, + "step": 20366 + }, + { + "epoch": 2.5908917440529193, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23276710510254, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8613572120666504, + "num_tokens": 777155809.0, + "step": 20367 + }, + { + "epoch": 2.59101895433151, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.424848556518555, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8741816878318787, + "num_tokens": 777195868.0, + "step": 20368 + }, + { + "epoch": 2.5911461646101004, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23685646057129, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8785133957862854, + "num_tokens": 777233406.0, + "step": 20369 + }, + { + "epoch": 2.591273374888691, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.119935989379883, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8759251236915588, + "num_tokens": 777272817.0, + "step": 20370 + }, + { + "epoch": 2.5914005851672814, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.448684692382812, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8692879676818848, + "num_tokens": 777312398.0, + "step": 20371 + }, + { + "epoch": 2.591527795445872, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.198135375976562, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8841373920440674, + "num_tokens": 777349375.0, + "step": 20372 + }, + { + "epoch": 2.5916550057244625, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.446062088012695, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8842837810516357, + "num_tokens": 777394806.0, + "step": 20373 + }, + { + "epoch": 2.591782216003053, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 25.977445602416992, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8789748549461365, + "num_tokens": 777434350.0, + "step": 20374 + }, + { + "epoch": 2.5919094262816436, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.177125930786133, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8481159210205078, + "num_tokens": 777469690.0, + "step": 20375 + }, + { + "epoch": 2.592036636560234, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51320457458496, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8673324584960938, + "num_tokens": 777502095.0, + "step": 20376 + }, + { + "epoch": 2.5921638468388246, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.206457138061523, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8855689764022827, + "num_tokens": 777540387.0, + "step": 20377 + }, + { + "epoch": 2.592291057117415, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.374113082885742, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8794604539871216, + "num_tokens": 777581855.0, + "step": 20378 + }, + { + "epoch": 2.5924182673960057, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.218172073364258, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.874312162399292, + "num_tokens": 777619852.0, + "step": 20379 + }, + { + "epoch": 2.592545477674596, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.235158920288086, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8671742677688599, + "num_tokens": 777662772.0, + "step": 20380 + }, + { + "epoch": 2.5926726879531867, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17784881591797, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8568430542945862, + "num_tokens": 777701171.0, + "step": 20381 + }, + { + "epoch": 2.5927998982317773, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.423465728759766, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.859697163105011, + "num_tokens": 777738245.0, + "step": 20382 + }, + { + "epoch": 2.592927108510368, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4509334564209, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8739460110664368, + "num_tokens": 777775732.0, + "step": 20383 + }, + { + "epoch": 2.5930543187889583, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.340225219726562, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8748557567596436, + "num_tokens": 777811077.0, + "step": 20384 + }, + { + "epoch": 2.5931815290675484, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2358341217041, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8716727495193481, + "num_tokens": 777854107.0, + "step": 20385 + }, + { + "epoch": 2.5933087393461394, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.318809509277344, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.873273491859436, + "num_tokens": 777891950.0, + "step": 20386 + }, + { + "epoch": 2.5934359496247295, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11001968383789, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8918955326080322, + "num_tokens": 777932638.0, + "step": 20387 + }, + { + "epoch": 2.5935631599033204, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.466712951660156, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8718419075012207, + "num_tokens": 777977733.0, + "step": 20388 + }, + { + "epoch": 2.5936903701819105, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.461746215820312, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8789123892784119, + "num_tokens": 778020451.0, + "step": 20389 + }, + { + "epoch": 2.5938175804605015, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.405332565307617, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8800677061080933, + "num_tokens": 778059594.0, + "step": 20390 + }, + { + "epoch": 2.5939447907390916, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08910369873047, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8785357475280762, + "num_tokens": 778099235.0, + "step": 20391 + }, + { + "epoch": 2.594072001017682, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.413156509399414, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8855128288269043, + "num_tokens": 778135430.0, + "step": 20392 + }, + { + "epoch": 2.5941992112962726, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.387248992919922, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.867281973361969, + "num_tokens": 778175111.0, + "step": 20393 + }, + { + "epoch": 2.594326421574863, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.276866912841797, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8650826811790466, + "num_tokens": 778211954.0, + "step": 20394 + }, + { + "epoch": 2.5944536318534537, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.454364776611328, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8579301834106445, + "num_tokens": 778246580.0, + "step": 20395 + }, + { + "epoch": 2.594580842132044, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.236356735229492, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8799290657043457, + "num_tokens": 778288659.0, + "step": 20396 + }, + { + "epoch": 2.5947080524106347, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28849220275879, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8741530179977417, + "num_tokens": 778325365.0, + "step": 20397 + }, + { + "epoch": 2.5948352626892253, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2104549407959, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8801813125610352, + "num_tokens": 778362982.0, + "step": 20398 + }, + { + "epoch": 2.594962472967816, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.121885299682617, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8600304126739502, + "num_tokens": 778394230.0, + "step": 20399 + }, + { + "epoch": 2.5950896832464063, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.295095443725586, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8634878396987915, + "num_tokens": 778434345.0, + "step": 20400 + }, + { + "epoch": 2.595216893524997, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38291358947754, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8686968684196472, + "num_tokens": 778478303.0, + "step": 20401 + }, + { + "epoch": 2.5953441038035874, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.188562393188477, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.879889726638794, + "num_tokens": 778513915.0, + "step": 20402 + }, + { + "epoch": 2.595471314082178, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29942512512207, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8896727561950684, + "num_tokens": 778554294.0, + "step": 20403 + }, + { + "epoch": 2.5955985243607684, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18347930908203, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.871370255947113, + "num_tokens": 778592836.0, + "step": 20404 + }, + { + "epoch": 2.595725734639359, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43304443359375, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.865383505821228, + "num_tokens": 778629316.0, + "step": 20405 + }, + { + "epoch": 2.5958529449179495, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.382116317749023, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8699682950973511, + "num_tokens": 778659065.0, + "step": 20406 + }, + { + "epoch": 2.59598015519654, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29507064819336, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.889255702495575, + "num_tokens": 778693949.0, + "step": 20407 + }, + { + "epoch": 2.5961073654751305, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.348655700683594, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.878960132598877, + "num_tokens": 778738045.0, + "step": 20408 + }, + { + "epoch": 2.596234575753721, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.313587188720703, + "learning_rate": 1e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8320963382720947, + "num_tokens": 778769578.0, + "step": 20409 + }, + { + "epoch": 2.596361786032311, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.366147994995117, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8691517114639282, + "num_tokens": 778810052.0, + "step": 20410 + }, + { + "epoch": 2.596488996310902, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.292497634887695, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8684093952178955, + "num_tokens": 778850868.0, + "step": 20411 + }, + { + "epoch": 2.596616206589492, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29029655456543, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8805310130119324, + "num_tokens": 778890129.0, + "step": 20412 + }, + { + "epoch": 2.596743416868083, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.565387725830078, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.877919614315033, + "num_tokens": 778923181.0, + "step": 20413 + }, + { + "epoch": 2.5968706271466733, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.260116577148438, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8656803965568542, + "num_tokens": 778963328.0, + "step": 20414 + }, + { + "epoch": 2.596997837425264, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27554702758789, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8842531442642212, + "num_tokens": 779003932.0, + "step": 20415 + }, + { + "epoch": 2.5971250477038543, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.278650283813477, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8635352849960327, + "num_tokens": 779043913.0, + "step": 20416 + }, + { + "epoch": 2.597252257982445, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.289608001708984, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8708783388137817, + "num_tokens": 779077126.0, + "step": 20417 + }, + { + "epoch": 2.5973794682610354, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.186891555786133, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8780505657196045, + "num_tokens": 779111544.0, + "step": 20418 + }, + { + "epoch": 2.597506678539626, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.186052322387695, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8726385831832886, + "num_tokens": 779145493.0, + "step": 20419 + }, + { + "epoch": 2.5976338888182164, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.382349014282227, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8658902645111084, + "num_tokens": 779189216.0, + "step": 20420 + }, + { + "epoch": 2.597761099096807, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.133081436157227, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.880731463432312, + "num_tokens": 779225914.0, + "step": 20421 + }, + { + "epoch": 2.5978883093753975, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.324724197387695, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8784217834472656, + "num_tokens": 779264118.0, + "step": 20422 + }, + { + "epoch": 2.598015519653988, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.236494064331055, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8774552345275879, + "num_tokens": 779301694.0, + "step": 20423 + }, + { + "epoch": 2.5981427299325786, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.212438583374023, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8725404739379883, + "num_tokens": 779339012.0, + "step": 20424 + }, + { + "epoch": 2.598269940211169, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.532379150390625, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8782724738121033, + "num_tokens": 779376358.0, + "step": 20425 + }, + { + "epoch": 2.5983971504897596, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17392921447754, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8733202219009399, + "num_tokens": 779416344.0, + "step": 20426 + }, + { + "epoch": 2.59852436076835, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.428680419921875, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8620412349700928, + "num_tokens": 779452189.0, + "step": 20427 + }, + { + "epoch": 2.5986515710469407, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.237035751342773, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8723539113998413, + "num_tokens": 779493513.0, + "step": 20428 + }, + { + "epoch": 2.598778781325531, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24653434753418, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8803211450576782, + "num_tokens": 779533061.0, + "step": 20429 + }, + { + "epoch": 2.5989059916041217, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.191425323486328, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8666445016860962, + "num_tokens": 779570159.0, + "step": 20430 + }, + { + "epoch": 2.5990332018827123, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.363487243652344, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8711965680122375, + "num_tokens": 779609066.0, + "step": 20431 + }, + { + "epoch": 2.599160412161303, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.305692672729492, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8728991150856018, + "num_tokens": 779651087.0, + "step": 20432 + }, + { + "epoch": 2.5992876224398933, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.381418228149414, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8871105313301086, + "num_tokens": 779689531.0, + "step": 20433 + }, + { + "epoch": 2.599414832718484, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.197872161865234, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8682918548583984, + "num_tokens": 779724062.0, + "step": 20434 + }, + { + "epoch": 2.599542042997074, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.308759689331055, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8608497977256775, + "num_tokens": 779766029.0, + "step": 20435 + }, + { + "epoch": 2.599669253275665, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09129524230957, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8584287166595459, + "num_tokens": 779811065.0, + "step": 20436 + }, + { + "epoch": 2.599796463554255, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.263126373291016, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8601936101913452, + "num_tokens": 779852442.0, + "step": 20437 + }, + { + "epoch": 2.599923673832846, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12347984313965, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8776624202728271, + "num_tokens": 779885993.0, + "step": 20438 + }, + { + "epoch": 2.600050884111436, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26730728149414, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8775056004524231, + "num_tokens": 779927791.0, + "step": 20439 + }, + { + "epoch": 2.6001780943900266, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195878982543945, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8764007687568665, + "num_tokens": 779961517.0, + "step": 20440 + }, + { + "epoch": 2.600305304668617, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.419870376586914, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8606493473052979, + "num_tokens": 779992902.0, + "step": 20441 + }, + { + "epoch": 2.6004325149472076, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25752830505371, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.873913049697876, + "num_tokens": 780032849.0, + "step": 20442 + }, + { + "epoch": 2.600559725225798, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13533592224121, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8856183290481567, + "num_tokens": 780072563.0, + "step": 20443 + }, + { + "epoch": 2.6006869355043887, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17255210876465, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.873721182346344, + "num_tokens": 780115299.0, + "step": 20444 + }, + { + "epoch": 2.600814145782979, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.532766342163086, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.87214195728302, + "num_tokens": 780155792.0, + "step": 20445 + }, + { + "epoch": 2.6009413560615697, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19620704650879, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8694360852241516, + "num_tokens": 780189040.0, + "step": 20446 + }, + { + "epoch": 2.6010685663401603, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.392581939697266, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8722242116928101, + "num_tokens": 780229463.0, + "step": 20447 + }, + { + "epoch": 2.601195776618751, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2081241607666, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8852726221084595, + "num_tokens": 780265961.0, + "step": 20448 + }, + { + "epoch": 2.6013229868973413, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.318923950195312, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8672826886177063, + "num_tokens": 780303220.0, + "step": 20449 + }, + { + "epoch": 2.601450197175932, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.338163375854492, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8697624206542969, + "num_tokens": 780340456.0, + "step": 20450 + }, + { + "epoch": 2.6015774074545224, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37517547607422, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8718669414520264, + "num_tokens": 780381402.0, + "step": 20451 + }, + { + "epoch": 2.601704617733113, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.13365364074707, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8777861595153809, + "num_tokens": 780422571.0, + "step": 20452 + }, + { + "epoch": 2.6018318280117034, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3963680267334, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8682231903076172, + "num_tokens": 780462447.0, + "step": 20453 + }, + { + "epoch": 2.601959038290294, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27596664428711, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.872951865196228, + "num_tokens": 780499183.0, + "step": 20454 + }, + { + "epoch": 2.6020862485688845, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.227733612060547, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8618462085723877, + "num_tokens": 780539216.0, + "step": 20455 + }, + { + "epoch": 2.602213458847475, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39006233215332, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.879905641078949, + "num_tokens": 780578725.0, + "step": 20456 + }, + { + "epoch": 2.6023406691260655, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.688461303710938, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8735747337341309, + "num_tokens": 780615175.0, + "step": 20457 + }, + { + "epoch": 2.6024678794046556, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.371688842773438, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8760358095169067, + "num_tokens": 780651186.0, + "step": 20458 + }, + { + "epoch": 2.6025950896832466, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38896369934082, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8658484816551208, + "num_tokens": 780685550.0, + "step": 20459 + }, + { + "epoch": 2.6027222999618367, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.268579483032227, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8941988348960876, + "num_tokens": 780725477.0, + "step": 20460 + }, + { + "epoch": 2.6028495102404277, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.10800552368164, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8750187158584595, + "num_tokens": 780762481.0, + "step": 20461 + }, + { + "epoch": 2.6029767205190177, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.415830612182617, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8528310060501099, + "num_tokens": 780800305.0, + "step": 20462 + }, + { + "epoch": 2.6031039307976087, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23705291748047, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.87493896484375, + "num_tokens": 780836271.0, + "step": 20463 + }, + { + "epoch": 2.603231141076199, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17578887939453, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.885108232498169, + "num_tokens": 780877363.0, + "step": 20464 + }, + { + "epoch": 2.6033583513547893, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.248502731323242, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8618240356445312, + "num_tokens": 780922982.0, + "step": 20465 + }, + { + "epoch": 2.60348556163338, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.351701736450195, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8715808987617493, + "num_tokens": 780957925.0, + "step": 20466 + }, + { + "epoch": 2.6036127719119704, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.404909133911133, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8547027707099915, + "num_tokens": 780996922.0, + "step": 20467 + }, + { + "epoch": 2.603739982190561, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.526269912719727, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8555459380149841, + "num_tokens": 781037209.0, + "step": 20468 + }, + { + "epoch": 2.6038671924691514, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32008934020996, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8658607006072998, + "num_tokens": 781078077.0, + "step": 20469 + }, + { + "epoch": 2.603994402747742, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3096923828125, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8721441030502319, + "num_tokens": 781122516.0, + "step": 20470 + }, + { + "epoch": 2.6041216130263325, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.385517120361328, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8654592633247375, + "num_tokens": 781156290.0, + "step": 20471 + }, + { + "epoch": 2.604248823304923, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.144393920898438, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8809051513671875, + "num_tokens": 781192769.0, + "step": 20472 + }, + { + "epoch": 2.6043760335835135, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40850257873535, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8767763376235962, + "num_tokens": 781227594.0, + "step": 20473 + }, + { + "epoch": 2.604503243862104, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.185670852661133, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8831997513771057, + "num_tokens": 781259369.0, + "step": 20474 + }, + { + "epoch": 2.6046304541406946, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24494743347168, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8733484148979187, + "num_tokens": 781296788.0, + "step": 20475 + }, + { + "epoch": 2.604757664419285, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11608123779297, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.874984860420227, + "num_tokens": 781333103.0, + "step": 20476 + }, + { + "epoch": 2.6048848746978757, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.334270477294922, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8815028667449951, + "num_tokens": 781372691.0, + "step": 20477 + }, + { + "epoch": 2.605012084976466, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.268640518188477, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8690661191940308, + "num_tokens": 781410480.0, + "step": 20478 + }, + { + "epoch": 2.6051392952550567, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.131685256958008, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8648785352706909, + "num_tokens": 781454110.0, + "step": 20479 + }, + { + "epoch": 2.6052665055336472, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.269710540771484, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8702434301376343, + "num_tokens": 781494034.0, + "step": 20480 + }, + { + "epoch": 2.6053937158122378, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.236827850341797, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8703584671020508, + "num_tokens": 781536225.0, + "step": 20481 + }, + { + "epoch": 2.6055209260908283, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.320369720458984, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8792035579681396, + "num_tokens": 781573987.0, + "step": 20482 + }, + { + "epoch": 2.6056481363694184, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.187686920166016, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8773106932640076, + "num_tokens": 781608987.0, + "step": 20483 + }, + { + "epoch": 2.6057753466480094, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.301149368286133, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8797856569290161, + "num_tokens": 781646185.0, + "step": 20484 + }, + { + "epoch": 2.6059025569265994, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.214540481567383, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8630998730659485, + "num_tokens": 781682999.0, + "step": 20485 + }, + { + "epoch": 2.6060297672051904, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33424186706543, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8706619739532471, + "num_tokens": 781726052.0, + "step": 20486 + }, + { + "epoch": 2.6061569774837805, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.402666091918945, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.885402262210846, + "num_tokens": 781760562.0, + "step": 20487 + }, + { + "epoch": 2.6062841877623715, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.219146728515625, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8620297908782959, + "num_tokens": 781803765.0, + "step": 20488 + }, + { + "epoch": 2.6064113980409616, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36825180053711, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8743653297424316, + "num_tokens": 781845419.0, + "step": 20489 + }, + { + "epoch": 2.606538608319552, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.433502197265625, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8819352388381958, + "num_tokens": 781883371.0, + "step": 20490 + }, + { + "epoch": 2.6066658185981426, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18625259399414, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8720616102218628, + "num_tokens": 781919669.0, + "step": 20491 + }, + { + "epoch": 2.606793028876733, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.210941314697266, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8788133859634399, + "num_tokens": 781953212.0, + "step": 20492 + }, + { + "epoch": 2.6069202391553237, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41961669921875, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8774382472038269, + "num_tokens": 781996824.0, + "step": 20493 + }, + { + "epoch": 2.607047449433914, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.231054306030273, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8920958042144775, + "num_tokens": 782035350.0, + "step": 20494 + }, + { + "epoch": 2.6071746597125047, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.250274658203125, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8679611682891846, + "num_tokens": 782073217.0, + "step": 20495 + }, + { + "epoch": 2.6073018699910953, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.281719207763672, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8672937154769897, + "num_tokens": 782110777.0, + "step": 20496 + }, + { + "epoch": 2.607429080269686, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.260303497314453, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8677052855491638, + "num_tokens": 782153300.0, + "step": 20497 + }, + { + "epoch": 2.6075562905482763, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.144004821777344, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8780561089515686, + "num_tokens": 782191945.0, + "step": 20498 + }, + { + "epoch": 2.607683500826867, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.341466903686523, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8830640316009521, + "num_tokens": 782230472.0, + "step": 20499 + }, + { + "epoch": 2.6078107111054574, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.386125564575195, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8630572557449341, + "num_tokens": 782263868.0, + "step": 20500 + }, + { + "epoch": 2.607937921384048, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08755111694336, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8758535385131836, + "num_tokens": 782299781.0, + "step": 20501 + }, + { + "epoch": 2.6080651316626384, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.326744079589844, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8662570118904114, + "num_tokens": 782343376.0, + "step": 20502 + }, + { + "epoch": 2.608192341941229, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.206083297729492, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8851528167724609, + "num_tokens": 782389894.0, + "step": 20503 + }, + { + "epoch": 2.6083195522198195, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2021541595459, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8780235052108765, + "num_tokens": 782430055.0, + "step": 20504 + }, + { + "epoch": 2.60844676249841, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.510520935058594, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8752239942550659, + "num_tokens": 782467211.0, + "step": 20505 + }, + { + "epoch": 2.6085739727770005, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.353126525878906, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8650423288345337, + "num_tokens": 782505913.0, + "step": 20506 + }, + { + "epoch": 2.608701183055591, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.17202377319336, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8724837303161621, + "num_tokens": 782541858.0, + "step": 20507 + }, + { + "epoch": 2.608828393334181, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28359031677246, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8733841180801392, + "num_tokens": 782575854.0, + "step": 20508 + }, + { + "epoch": 2.608955603612772, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.214242935180664, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8750755190849304, + "num_tokens": 782614005.0, + "step": 20509 + }, + { + "epoch": 2.609082813891362, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32999038696289, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8807200789451599, + "num_tokens": 782650147.0, + "step": 20510 + }, + { + "epoch": 2.609210024169953, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.231857299804688, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.873115599155426, + "num_tokens": 782683215.0, + "step": 20511 + }, + { + "epoch": 2.6093372344485433, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.451473236083984, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.878258466720581, + "num_tokens": 782721184.0, + "step": 20512 + }, + { + "epoch": 2.609464444727134, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.171707153320312, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8685104250907898, + "num_tokens": 782759891.0, + "step": 20513 + }, + { + "epoch": 2.6095916550057243, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.275245666503906, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8632962703704834, + "num_tokens": 782796398.0, + "step": 20514 + }, + { + "epoch": 2.609718865284315, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.338098526000977, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8831390738487244, + "num_tokens": 782835968.0, + "step": 20515 + }, + { + "epoch": 2.6098460755629054, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.11838722229004, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8842819929122925, + "num_tokens": 782871367.0, + "step": 20516 + }, + { + "epoch": 2.609973285841496, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.49338150024414, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8942222595214844, + "num_tokens": 782905766.0, + "step": 20517 + }, + { + "epoch": 2.6101004961200864, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23646354675293, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.87884920835495, + "num_tokens": 782942256.0, + "step": 20518 + }, + { + "epoch": 2.610227706398677, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41412925720215, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.883160412311554, + "num_tokens": 782976156.0, + "step": 20519 + }, + { + "epoch": 2.6103549166772675, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45989990234375, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8825268149375916, + "num_tokens": 783011450.0, + "step": 20520 + }, + { + "epoch": 2.610482126955858, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.135469436645508, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8791749477386475, + "num_tokens": 783050355.0, + "step": 20521 + }, + { + "epoch": 2.6106093372344485, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.234838485717773, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.883582592010498, + "num_tokens": 783091621.0, + "step": 20522 + }, + { + "epoch": 2.610736547513039, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.324932098388672, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8625648021697998, + "num_tokens": 783130471.0, + "step": 20523 + }, + { + "epoch": 2.6108637577916296, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44008445739746, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.878355860710144, + "num_tokens": 783168340.0, + "step": 20524 + }, + { + "epoch": 2.61099096807022, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37937355041504, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8549070358276367, + "num_tokens": 783207279.0, + "step": 20525 + }, + { + "epoch": 2.6111181783488107, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.247360229492188, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8667658567428589, + "num_tokens": 783243010.0, + "step": 20526 + }, + { + "epoch": 2.611245388627401, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.429174423217773, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8630315065383911, + "num_tokens": 783287548.0, + "step": 20527 + }, + { + "epoch": 2.6113725989059917, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29388427734375, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8705805540084839, + "num_tokens": 783331549.0, + "step": 20528 + }, + { + "epoch": 2.6114998091845822, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3663387298584, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8837965726852417, + "num_tokens": 783369445.0, + "step": 20529 + }, + { + "epoch": 2.6116270194631728, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.265010833740234, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8695483207702637, + "num_tokens": 783413218.0, + "step": 20530 + }, + { + "epoch": 2.6117542297417633, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.548791885375977, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8732127547264099, + "num_tokens": 783449944.0, + "step": 20531 + }, + { + "epoch": 2.611881440020354, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.377025604248047, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8736445307731628, + "num_tokens": 783486142.0, + "step": 20532 + }, + { + "epoch": 2.612008650298944, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.230587005615234, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8799880743026733, + "num_tokens": 783521170.0, + "step": 20533 + }, + { + "epoch": 2.612135860577535, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27088737487793, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8717128038406372, + "num_tokens": 783563902.0, + "step": 20534 + }, + { + "epoch": 2.612263070856125, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36559295654297, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8776944875717163, + "num_tokens": 783597655.0, + "step": 20535 + }, + { + "epoch": 2.612390281134716, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.273042678833008, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8631422519683838, + "num_tokens": 783635233.0, + "step": 20536 + }, + { + "epoch": 2.612517491413306, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.442245483398438, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8749876022338867, + "num_tokens": 783679677.0, + "step": 20537 + }, + { + "epoch": 2.6126447016918966, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.259122848510742, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8805246949195862, + "num_tokens": 783720608.0, + "step": 20538 + }, + { + "epoch": 2.612771911970487, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.659870147705078, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.883545458316803, + "num_tokens": 783753198.0, + "step": 20539 + }, + { + "epoch": 2.6128991222490776, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.177886962890625, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8736368417739868, + "num_tokens": 783791068.0, + "step": 20540 + }, + { + "epoch": 2.613026332527668, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.296892166137695, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8740347623825073, + "num_tokens": 783831319.0, + "step": 20541 + }, + { + "epoch": 2.6131535428062587, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.414945602416992, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8771019577980042, + "num_tokens": 783867537.0, + "step": 20542 + }, + { + "epoch": 2.613280753084849, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.459535598754883, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8685330152511597, + "num_tokens": 783907787.0, + "step": 20543 + }, + { + "epoch": 2.6134079633634397, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.09099769592285, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8681918382644653, + "num_tokens": 783948558.0, + "step": 20544 + }, + { + "epoch": 2.6135351736420303, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.280302047729492, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8914512991905212, + "num_tokens": 783986726.0, + "step": 20545 + }, + { + "epoch": 2.613662383920621, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.712514877319336, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8561712503433228, + "num_tokens": 784023449.0, + "step": 20546 + }, + { + "epoch": 2.6137895941992113, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.164716720581055, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8894383907318115, + "num_tokens": 784059529.0, + "step": 20547 + }, + { + "epoch": 2.613916804477802, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.508333206176758, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8746122121810913, + "num_tokens": 784103530.0, + "step": 20548 + }, + { + "epoch": 2.6140440147563924, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.173099517822266, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8688709139823914, + "num_tokens": 784141721.0, + "step": 20549 + }, + { + "epoch": 2.614171225034983, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59412384033203, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8739392757415771, + "num_tokens": 784175868.0, + "step": 20550 + }, + { + "epoch": 2.6142984353135734, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.412153244018555, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8856097459793091, + "num_tokens": 784210646.0, + "step": 20551 + }, + { + "epoch": 2.614425645592164, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41446304321289, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8792339563369751, + "num_tokens": 784255588.0, + "step": 20552 + }, + { + "epoch": 2.6145528558707545, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37051010131836, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8769686222076416, + "num_tokens": 784292877.0, + "step": 20553 + }, + { + "epoch": 2.614680066149345, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.153226852416992, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8783068656921387, + "num_tokens": 784336150.0, + "step": 20554 + }, + { + "epoch": 2.6148072764279355, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.427213668823242, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8860173225402832, + "num_tokens": 784371547.0, + "step": 20555 + }, + { + "epoch": 2.6149344867065256, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2598819732666, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8762022256851196, + "num_tokens": 784411473.0, + "step": 20556 + }, + { + "epoch": 2.6150616969851166, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.417348861694336, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8647482991218567, + "num_tokens": 784456713.0, + "step": 20557 + }, + { + "epoch": 2.6151889072637067, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.436431884765625, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8884944319725037, + "num_tokens": 784493685.0, + "step": 20558 + }, + { + "epoch": 2.6153161175422976, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.342329025268555, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8862656354904175, + "num_tokens": 784534641.0, + "step": 20559 + }, + { + "epoch": 2.6154433278208877, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6099796295166, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8512784838676453, + "num_tokens": 784576184.0, + "step": 20560 + }, + { + "epoch": 2.6155705380994787, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.339685440063477, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8784471750259399, + "num_tokens": 784613851.0, + "step": 20561 + }, + { + "epoch": 2.615697748378069, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.250947952270508, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8763263821601868, + "num_tokens": 784654133.0, + "step": 20562 + }, + { + "epoch": 2.6158249586566593, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28253746032715, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8759571313858032, + "num_tokens": 784690529.0, + "step": 20563 + }, + { + "epoch": 2.61595216893525, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.238460540771484, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8738726377487183, + "num_tokens": 784731972.0, + "step": 20564 + }, + { + "epoch": 2.6160793792138404, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.490188598632812, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8703434467315674, + "num_tokens": 784773275.0, + "step": 20565 + }, + { + "epoch": 2.616206589492431, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.453968048095703, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8602069020271301, + "num_tokens": 784811468.0, + "step": 20566 + }, + { + "epoch": 2.6163337997710214, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.266016006469727, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8766106367111206, + "num_tokens": 784850611.0, + "step": 20567 + }, + { + "epoch": 2.616461010049612, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57286262512207, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8799157738685608, + "num_tokens": 784886502.0, + "step": 20568 + }, + { + "epoch": 2.6165882203282025, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4539794921875, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.865700364112854, + "num_tokens": 784924489.0, + "step": 20569 + }, + { + "epoch": 2.616715430606793, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25062370300293, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8790741562843323, + "num_tokens": 784957224.0, + "step": 20570 + }, + { + "epoch": 2.6168426408853835, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.355865478515625, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8502463698387146, + "num_tokens": 784997754.0, + "step": 20571 + }, + { + "epoch": 2.616969851163974, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.059782028198242, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8597914576530457, + "num_tokens": 785033932.0, + "step": 20572 + }, + { + "epoch": 2.6170970614425646, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.368627548217773, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8712409734725952, + "num_tokens": 785073190.0, + "step": 20573 + }, + { + "epoch": 2.617224271721155, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.497318267822266, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8903121948242188, + "num_tokens": 785108397.0, + "step": 20574 + }, + { + "epoch": 2.6173514819997457, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.262075424194336, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.9002161026000977, + "num_tokens": 785142671.0, + "step": 20575 + }, + { + "epoch": 2.617478692278336, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.803037643432617, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8721799850463867, + "num_tokens": 785183467.0, + "step": 20576 + }, + { + "epoch": 2.6176059025569267, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34566879272461, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8643583059310913, + "num_tokens": 785222072.0, + "step": 20577 + }, + { + "epoch": 2.6177331128355172, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.365434646606445, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.863602876663208, + "num_tokens": 785260458.0, + "step": 20578 + }, + { + "epoch": 2.6178603231141078, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.445133209228516, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8612597584724426, + "num_tokens": 785302434.0, + "step": 20579 + }, + { + "epoch": 2.6179875333926983, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.262222290039062, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8597939014434814, + "num_tokens": 785338251.0, + "step": 20580 + }, + { + "epoch": 2.6181147436712884, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39934539794922, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8697814345359802, + "num_tokens": 785384789.0, + "step": 20581 + }, + { + "epoch": 2.6182419539498794, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.372577667236328, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8740934729576111, + "num_tokens": 785424978.0, + "step": 20582 + }, + { + "epoch": 2.6183691642284694, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56319808959961, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.866063117980957, + "num_tokens": 785464421.0, + "step": 20583 + }, + { + "epoch": 2.6184963745070604, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.49996566772461, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8923105001449585, + "num_tokens": 785496166.0, + "step": 20584 + }, + { + "epoch": 2.6186235847856505, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40688133239746, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.860015332698822, + "num_tokens": 785528504.0, + "step": 20585 + }, + { + "epoch": 2.6187507950642415, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.694883346557617, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8767073154449463, + "num_tokens": 785570896.0, + "step": 20586 + }, + { + "epoch": 2.6188780053428315, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43474006652832, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8709930181503296, + "num_tokens": 785612464.0, + "step": 20587 + }, + { + "epoch": 2.619005215621422, + "ewc_loss": 0.03857421875, + "ewc_loss_parallel": 3.8623809814453125e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28775978088379, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8731300234794617, + "num_tokens": 785655500.0, + "step": 20588 + }, + { + "epoch": 2.6191324259000126, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.296571731567383, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8765541315078735, + "num_tokens": 785691186.0, + "step": 20589 + }, + { + "epoch": 2.619259636178603, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.441471099853516, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8864978551864624, + "num_tokens": 785728575.0, + "step": 20590 + }, + { + "epoch": 2.6193868464571937, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.345016479492188, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8720176815986633, + "num_tokens": 785765782.0, + "step": 20591 + }, + { + "epoch": 2.619514056735784, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12241554260254, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.878240168094635, + "num_tokens": 785800130.0, + "step": 20592 + }, + { + "epoch": 2.6196412670143747, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.281009674072266, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.9012513160705566, + "num_tokens": 785836647.0, + "step": 20593 + }, + { + "epoch": 2.6197684772929652, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38827896118164, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8683210611343384, + "num_tokens": 785873897.0, + "step": 20594 + }, + { + "epoch": 2.6198956875715558, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18946075439453, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8690869808197021, + "num_tokens": 785914296.0, + "step": 20595 + }, + { + "epoch": 2.6200228978501463, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19913673400879, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8616483211517334, + "num_tokens": 785951426.0, + "step": 20596 + }, + { + "epoch": 2.620150108128737, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42741584777832, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8767404556274414, + "num_tokens": 785989906.0, + "step": 20597 + }, + { + "epoch": 2.6202773184073274, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.457204818725586, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.868711531162262, + "num_tokens": 786024763.0, + "step": 20598 + }, + { + "epoch": 2.620404528685918, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.262096405029297, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8705324530601501, + "num_tokens": 786063055.0, + "step": 20599 + }, + { + "epoch": 2.6205317389645084, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.484710693359375, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8481432199478149, + "num_tokens": 786097675.0, + "step": 20600 + }, + { + "epoch": 2.620658949243099, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.302553176879883, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.88599693775177, + "num_tokens": 786133721.0, + "step": 20601 + }, + { + "epoch": 2.6207861595216895, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.426340103149414, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8557104468345642, + "num_tokens": 786179615.0, + "step": 20602 + }, + { + "epoch": 2.62091336980028, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.525774002075195, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8881477117538452, + "num_tokens": 786214945.0, + "step": 20603 + }, + { + "epoch": 2.6210405800788705, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.12145233154297, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8758456707000732, + "num_tokens": 786250880.0, + "step": 20604 + }, + { + "epoch": 2.621167790357461, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.287172317504883, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8737941980361938, + "num_tokens": 786286095.0, + "step": 20605 + }, + { + "epoch": 2.621295000636051, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.477771759033203, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8785157203674316, + "num_tokens": 786319547.0, + "step": 20606 + }, + { + "epoch": 2.621422210914642, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.353046417236328, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8740203380584717, + "num_tokens": 786358699.0, + "step": 20607 + }, + { + "epoch": 2.621549421193232, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.016847610473633, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8778363466262817, + "num_tokens": 786399732.0, + "step": 20608 + }, + { + "epoch": 2.621676631471823, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55733299255371, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8653866052627563, + "num_tokens": 786437375.0, + "step": 20609 + }, + { + "epoch": 2.6218038417504133, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.264873504638672, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8634019494056702, + "num_tokens": 786475538.0, + "step": 20610 + }, + { + "epoch": 2.621931052029004, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.293941497802734, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8758611679077148, + "num_tokens": 786516802.0, + "step": 20611 + }, + { + "epoch": 2.6220582623075943, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.192699432373047, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8732852935791016, + "num_tokens": 786558470.0, + "step": 20612 + }, + { + "epoch": 2.622185472586185, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44289779663086, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8689626455307007, + "num_tokens": 786601163.0, + "step": 20613 + }, + { + "epoch": 2.6223126828647754, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83746910095215, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8799633979797363, + "num_tokens": 786636741.0, + "step": 20614 + }, + { + "epoch": 2.622439893143366, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.362287521362305, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8622671365737915, + "num_tokens": 786672931.0, + "step": 20615 + }, + { + "epoch": 2.6225671034219564, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26741600036621, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8861737251281738, + "num_tokens": 786709310.0, + "step": 20616 + }, + { + "epoch": 2.622694313700547, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.469995498657227, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8803926110267639, + "num_tokens": 786747113.0, + "step": 20617 + }, + { + "epoch": 2.6228215239791375, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.379976272583008, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8718546628952026, + "num_tokens": 786782033.0, + "step": 20618 + }, + { + "epoch": 2.622948734257728, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.311172485351562, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8751232624053955, + "num_tokens": 786827229.0, + "step": 20619 + }, + { + "epoch": 2.6230759445363185, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.330238342285156, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.881502091884613, + "num_tokens": 786861722.0, + "step": 20620 + }, + { + "epoch": 2.623203154814909, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.08483123779297, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8769926428794861, + "num_tokens": 786896202.0, + "step": 20621 + }, + { + "epoch": 2.6233303650934996, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3775577545166, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.876154899597168, + "num_tokens": 786932154.0, + "step": 20622 + }, + { + "epoch": 2.62345757537209, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3449649810791, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8649951815605164, + "num_tokens": 786972184.0, + "step": 20623 + }, + { + "epoch": 2.6235847856506807, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.322412490844727, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8677254319190979, + "num_tokens": 787015514.0, + "step": 20624 + }, + { + "epoch": 2.623711995929271, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.385080337524414, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8747845888137817, + "num_tokens": 787054023.0, + "step": 20625 + }, + { + "epoch": 2.6238392062078617, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54525375366211, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8709915280342102, + "num_tokens": 787088858.0, + "step": 20626 + }, + { + "epoch": 2.6239664164864522, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195314407348633, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8767334818840027, + "num_tokens": 787124805.0, + "step": 20627 + }, + { + "epoch": 2.6240936267650428, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47986602783203, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8857501745223999, + "num_tokens": 787164922.0, + "step": 20628 + }, + { + "epoch": 2.6242208370436333, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.314348220825195, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8855592012405396, + "num_tokens": 787204936.0, + "step": 20629 + }, + { + "epoch": 2.624348047322224, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.406333923339844, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8853122591972351, + "num_tokens": 787238292.0, + "step": 20630 + }, + { + "epoch": 2.624475257600814, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.458236694335938, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8934004306793213, + "num_tokens": 787269054.0, + "step": 20631 + }, + { + "epoch": 2.624602467879405, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.270034790039062, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8732178211212158, + "num_tokens": 787310022.0, + "step": 20632 + }, + { + "epoch": 2.624729678157995, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31925392150879, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8642035722732544, + "num_tokens": 787350917.0, + "step": 20633 + }, + { + "epoch": 2.624856888436586, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.486225128173828, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8818618059158325, + "num_tokens": 787388187.0, + "step": 20634 + }, + { + "epoch": 2.624984098715176, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15460777282715, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8802893161773682, + "num_tokens": 787424998.0, + "step": 20635 + }, + { + "epoch": 2.6251113089937665, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28468132019043, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8854720592498779, + "num_tokens": 787465456.0, + "step": 20636 + }, + { + "epoch": 2.625238519272357, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45226287841797, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.872807502746582, + "num_tokens": 787504797.0, + "step": 20637 + }, + { + "epoch": 2.6253657295509476, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23839569091797, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8839674592018127, + "num_tokens": 787542911.0, + "step": 20638 + }, + { + "epoch": 2.625492939829538, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.285411834716797, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8658969402313232, + "num_tokens": 787576143.0, + "step": 20639 + }, + { + "epoch": 2.6256201501081287, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.353363037109375, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8756406307220459, + "num_tokens": 787619912.0, + "step": 20640 + }, + { + "epoch": 2.625747360386719, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40620231628418, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8651010990142822, + "num_tokens": 787658181.0, + "step": 20641 + }, + { + "epoch": 2.6258745706653097, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.335174560546875, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8645437955856323, + "num_tokens": 787694889.0, + "step": 20642 + }, + { + "epoch": 2.6260017809439002, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29499626159668, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8871140480041504, + "num_tokens": 787737633.0, + "step": 20643 + }, + { + "epoch": 2.6261289912224908, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.246337890625, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8617462515830994, + "num_tokens": 787779069.0, + "step": 20644 + }, + { + "epoch": 2.6262562015010813, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.359848022460938, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8617207407951355, + "num_tokens": 787815544.0, + "step": 20645 + }, + { + "epoch": 2.626383411779672, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.338531494140625, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8692092895507812, + "num_tokens": 787852684.0, + "step": 20646 + }, + { + "epoch": 2.6265106220582624, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.379913330078125, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8698848485946655, + "num_tokens": 787893452.0, + "step": 20647 + }, + { + "epoch": 2.626637832336853, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.392093658447266, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8738470077514648, + "num_tokens": 787929430.0, + "step": 20648 + }, + { + "epoch": 2.6267650426154434, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.280868530273438, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8714116811752319, + "num_tokens": 787969768.0, + "step": 20649 + }, + { + "epoch": 2.626892252894034, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40363883972168, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.87419593334198, + "num_tokens": 788009605.0, + "step": 20650 + }, + { + "epoch": 2.6270194631726245, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.172649383544922, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8860551714897156, + "num_tokens": 788044652.0, + "step": 20651 + }, + { + "epoch": 2.627146673451215, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.394182205200195, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8489198684692383, + "num_tokens": 788085981.0, + "step": 20652 + }, + { + "epoch": 2.6272738837298055, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.425296783447266, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8709431290626526, + "num_tokens": 788132196.0, + "step": 20653 + }, + { + "epoch": 2.6274010940083956, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.060440063476562, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8867148160934448, + "num_tokens": 788160099.0, + "step": 20654 + }, + { + "epoch": 2.6275283042869866, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.440343856811523, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8726646304130554, + "num_tokens": 788197264.0, + "step": 20655 + }, + { + "epoch": 2.6276555145655767, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.183963775634766, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8853033781051636, + "num_tokens": 788232900.0, + "step": 20656 + }, + { + "epoch": 2.6277827248441676, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.239501953125, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8690527677536011, + "num_tokens": 788273720.0, + "step": 20657 + }, + { + "epoch": 2.6279099351227577, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.46368408203125, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8771224021911621, + "num_tokens": 788308425.0, + "step": 20658 + }, + { + "epoch": 2.6280371454013487, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.382246017456055, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8667442798614502, + "num_tokens": 788343701.0, + "step": 20659 + }, + { + "epoch": 2.628164355679939, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.204185485839844, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8586698770523071, + "num_tokens": 788372789.0, + "step": 20660 + }, + { + "epoch": 2.6282915659585293, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.520469665527344, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8785013556480408, + "num_tokens": 788408422.0, + "step": 20661 + }, + { + "epoch": 2.62841877623712, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.401357650756836, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8772079944610596, + "num_tokens": 788446113.0, + "step": 20662 + }, + { + "epoch": 2.6285459865157104, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.139694213867188, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8678957223892212, + "num_tokens": 788485009.0, + "step": 20663 + }, + { + "epoch": 2.628673196794301, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.533140182495117, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8770278692245483, + "num_tokens": 788523446.0, + "step": 20664 + }, + { + "epoch": 2.6288004070728914, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.105077743530273, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.882918655872345, + "num_tokens": 788557423.0, + "step": 20665 + }, + { + "epoch": 2.628927617351482, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.862560272216797, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.870581865310669, + "num_tokens": 788588177.0, + "step": 20666 + }, + { + "epoch": 2.6290548276300725, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.470117568969727, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8747615814208984, + "num_tokens": 788624877.0, + "step": 20667 + }, + { + "epoch": 2.629182037908663, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.154584884643555, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8838602900505066, + "num_tokens": 788662532.0, + "step": 20668 + }, + { + "epoch": 2.6293092481872535, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60637664794922, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8747596144676208, + "num_tokens": 788702734.0, + "step": 20669 + }, + { + "epoch": 2.629436458465844, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.377456665039062, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8756029605865479, + "num_tokens": 788738041.0, + "step": 20670 + }, + { + "epoch": 2.6295636687444346, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25025749206543, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8746663331985474, + "num_tokens": 788772998.0, + "step": 20671 + }, + { + "epoch": 2.629690879023025, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96839714050293, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8583853244781494, + "num_tokens": 788814310.0, + "step": 20672 + }, + { + "epoch": 2.6298180893016156, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2615966796875, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8773642778396606, + "num_tokens": 788852811.0, + "step": 20673 + }, + { + "epoch": 2.629945299580206, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.316905975341797, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8820633888244629, + "num_tokens": 788888756.0, + "step": 20674 + }, + { + "epoch": 2.6300725098587967, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2772159576416, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8783862590789795, + "num_tokens": 788926292.0, + "step": 20675 + }, + { + "epoch": 2.6301997201373872, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.575458526611328, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.884366512298584, + "num_tokens": 788965625.0, + "step": 20676 + }, + { + "epoch": 2.6303269304159778, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1882381439209, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8782466650009155, + "num_tokens": 789007390.0, + "step": 20677 + }, + { + "epoch": 2.6304541406945683, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.483659744262695, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8781866431236267, + "num_tokens": 789047940.0, + "step": 20678 + }, + { + "epoch": 2.6305813509731584, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.669174194335938, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8742017149925232, + "num_tokens": 789086321.0, + "step": 20679 + }, + { + "epoch": 2.6307085612517493, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.358694076538086, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.885238528251648, + "num_tokens": 789121757.0, + "step": 20680 + }, + { + "epoch": 2.6308357715303394, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3118953704834, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8885823488235474, + "num_tokens": 789160076.0, + "step": 20681 + }, + { + "epoch": 2.6309629818089304, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.412321090698242, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8651973009109497, + "num_tokens": 789197080.0, + "step": 20682 + }, + { + "epoch": 2.6310901920875205, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.484058380126953, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.868990421295166, + "num_tokens": 789249876.0, + "step": 20683 + }, + { + "epoch": 2.6312174023661115, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.379077911376953, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8727446794509888, + "num_tokens": 789290146.0, + "step": 20684 + }, + { + "epoch": 2.6313446126447015, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.424766540527344, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8779892921447754, + "num_tokens": 789335610.0, + "step": 20685 + }, + { + "epoch": 2.631471822923292, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43302345275879, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8695827722549438, + "num_tokens": 789371956.0, + "step": 20686 + }, + { + "epoch": 2.6315990332018826, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.610694885253906, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8695700168609619, + "num_tokens": 789415257.0, + "step": 20687 + }, + { + "epoch": 2.631726243480473, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32891082763672, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8734626770019531, + "num_tokens": 789452533.0, + "step": 20688 + }, + { + "epoch": 2.6318534537590637, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.267730712890625, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8717646598815918, + "num_tokens": 789488780.0, + "step": 20689 + }, + { + "epoch": 2.631980664037654, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38401222229004, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8720275163650513, + "num_tokens": 789528169.0, + "step": 20690 + }, + { + "epoch": 2.6321078743162447, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.308530807495117, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8665993213653564, + "num_tokens": 789568382.0, + "step": 20691 + }, + { + "epoch": 2.6322350845948352, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503870010375977, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8713335990905762, + "num_tokens": 789606895.0, + "step": 20692 + }, + { + "epoch": 2.6323622948734258, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29079818725586, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8591358661651611, + "num_tokens": 789649106.0, + "step": 20693 + }, + { + "epoch": 2.6324895051520163, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.352357864379883, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8778029084205627, + "num_tokens": 789689449.0, + "step": 20694 + }, + { + "epoch": 2.632616715430607, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.390329360961914, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8846041560173035, + "num_tokens": 789726179.0, + "step": 20695 + }, + { + "epoch": 2.6327439257091974, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.527833938598633, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8569881916046143, + "num_tokens": 789764858.0, + "step": 20696 + }, + { + "epoch": 2.632871135987788, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3219051361084, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8618506789207458, + "num_tokens": 789799196.0, + "step": 20697 + }, + { + "epoch": 2.6329983462663784, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53959846496582, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.871927797794342, + "num_tokens": 789840498.0, + "step": 20698 + }, + { + "epoch": 2.633125556544969, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.216733932495117, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8774491548538208, + "num_tokens": 789874269.0, + "step": 20699 + }, + { + "epoch": 2.6332527668235595, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503625869750977, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8749446868896484, + "num_tokens": 789914938.0, + "step": 20700 + }, + { + "epoch": 2.63337997710215, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.381885528564453, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8739058971405029, + "num_tokens": 789959219.0, + "step": 20701 + }, + { + "epoch": 2.6335071873807405, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.302684783935547, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8672722578048706, + "num_tokens": 789993707.0, + "step": 20702 + }, + { + "epoch": 2.633634397659331, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3492374420166, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8545253872871399, + "num_tokens": 790034285.0, + "step": 20703 + }, + { + "epoch": 2.633761607937921, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.384634017944336, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8772710561752319, + "num_tokens": 790069714.0, + "step": 20704 + }, + { + "epoch": 2.633888818216512, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.408479690551758, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8719068169593811, + "num_tokens": 790110733.0, + "step": 20705 + }, + { + "epoch": 2.634016028495102, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23398780822754, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8792263269424438, + "num_tokens": 790144468.0, + "step": 20706 + }, + { + "epoch": 2.634143238773693, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.312538146972656, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8720242977142334, + "num_tokens": 790186881.0, + "step": 20707 + }, + { + "epoch": 2.6342704490522832, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31817626953125, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8812477588653564, + "num_tokens": 790225083.0, + "step": 20708 + }, + { + "epoch": 2.6343976593308738, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.613405227661133, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8736051917076111, + "num_tokens": 790266822.0, + "step": 20709 + }, + { + "epoch": 2.6345248696094643, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.20244789123535, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8704807162284851, + "num_tokens": 790305948.0, + "step": 20710 + }, + { + "epoch": 2.634652079888055, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.517839431762695, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8751726150512695, + "num_tokens": 790341639.0, + "step": 20711 + }, + { + "epoch": 2.6347792901666454, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.435001373291016, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8833867311477661, + "num_tokens": 790377442.0, + "step": 20712 + }, + { + "epoch": 2.634906500445236, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.433361053466797, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8749103546142578, + "num_tokens": 790414911.0, + "step": 20713 + }, + { + "epoch": 2.6350337107238264, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.413707733154297, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8724551200866699, + "num_tokens": 790453968.0, + "step": 20714 + }, + { + "epoch": 2.635160921002417, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.397388458251953, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8699472546577454, + "num_tokens": 790493216.0, + "step": 20715 + }, + { + "epoch": 2.6352881312810075, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32317352294922, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.86977219581604, + "num_tokens": 790529380.0, + "step": 20716 + }, + { + "epoch": 2.635415341559598, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.625497817993164, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8684407472610474, + "num_tokens": 790569568.0, + "step": 20717 + }, + { + "epoch": 2.6355425518381885, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.403350830078125, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8806939721107483, + "num_tokens": 790602269.0, + "step": 20718 + }, + { + "epoch": 2.635669762116779, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.412687301635742, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8616147041320801, + "num_tokens": 790639723.0, + "step": 20719 + }, + { + "epoch": 2.6357969723953696, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.461259841918945, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8603330850601196, + "num_tokens": 790676103.0, + "step": 20720 + }, + { + "epoch": 2.63592418267396, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.496065139770508, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8758587837219238, + "num_tokens": 790712118.0, + "step": 20721 + }, + { + "epoch": 2.6360513929525506, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.193754196166992, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8733981847763062, + "num_tokens": 790747079.0, + "step": 20722 + }, + { + "epoch": 2.636178603231141, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.360309600830078, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8639287948608398, + "num_tokens": 790781741.0, + "step": 20723 + }, + { + "epoch": 2.6363058135097317, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.443235397338867, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8728852868080139, + "num_tokens": 790820243.0, + "step": 20724 + }, + { + "epoch": 2.6364330237883222, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.482120513916016, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8885785341262817, + "num_tokens": 790851492.0, + "step": 20725 + }, + { + "epoch": 2.6365602340669128, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45585060119629, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8793665170669556, + "num_tokens": 790882293.0, + "step": 20726 + }, + { + "epoch": 2.636687444345503, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39035987854004, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.869989275932312, + "num_tokens": 790922188.0, + "step": 20727 + }, + { + "epoch": 2.636814654624094, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.330289840698242, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8828936219215393, + "num_tokens": 790960675.0, + "step": 20728 + }, + { + "epoch": 2.636941864902684, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.394126892089844, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8822027444839478, + "num_tokens": 790999352.0, + "step": 20729 + }, + { + "epoch": 2.637069075181275, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38372802734375, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8749487400054932, + "num_tokens": 791035898.0, + "step": 20730 + }, + { + "epoch": 2.637196285459865, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54233169555664, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8493321537971497, + "num_tokens": 791079731.0, + "step": 20731 + }, + { + "epoch": 2.637323495738456, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.195709228515625, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8862265348434448, + "num_tokens": 791118565.0, + "step": 20732 + }, + { + "epoch": 2.637450706017046, + "ewc_loss": 0.038818359375, + "ewc_loss_parallel": 3.886222839355469e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40496063232422, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8748567700386047, + "num_tokens": 791154876.0, + "step": 20733 + }, + { + "epoch": 2.6375779162956365, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.407678604125977, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8736768364906311, + "num_tokens": 791196003.0, + "step": 20734 + }, + { + "epoch": 2.637705126574227, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32516860961914, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.872223973274231, + "num_tokens": 791236462.0, + "step": 20735 + }, + { + "epoch": 2.6378323368528176, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.199750900268555, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8608723282814026, + "num_tokens": 791271918.0, + "step": 20736 + }, + { + "epoch": 2.637959547131408, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.346153259277344, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8854690194129944, + "num_tokens": 791309813.0, + "step": 20737 + }, + { + "epoch": 2.6380867574099987, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.275508880615234, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8819642066955566, + "num_tokens": 791338854.0, + "step": 20738 + }, + { + "epoch": 2.638213967688589, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.310237884521484, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8790592551231384, + "num_tokens": 791376580.0, + "step": 20739 + }, + { + "epoch": 2.6383411779671797, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.300230026245117, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8697531223297119, + "num_tokens": 791415931.0, + "step": 20740 + }, + { + "epoch": 2.6384683882457702, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36686134338379, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8695006370544434, + "num_tokens": 791448201.0, + "step": 20741 + }, + { + "epoch": 2.6385955985243608, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.360857009887695, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8671883344650269, + "num_tokens": 791489183.0, + "step": 20742 + }, + { + "epoch": 2.6387228088029513, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.564205169677734, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8750655651092529, + "num_tokens": 791528414.0, + "step": 20743 + }, + { + "epoch": 2.638850019081542, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.382625579833984, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8928152322769165, + "num_tokens": 791566379.0, + "step": 20744 + }, + { + "epoch": 2.6389772293601323, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.249723434448242, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8771445155143738, + "num_tokens": 791599789.0, + "step": 20745 + }, + { + "epoch": 2.639104439638723, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.681020736694336, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8603843450546265, + "num_tokens": 791642333.0, + "step": 20746 + }, + { + "epoch": 2.6392316499173134, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.293045043945312, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8741306662559509, + "num_tokens": 791687190.0, + "step": 20747 + }, + { + "epoch": 2.639358860195904, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.271541595458984, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8591185808181763, + "num_tokens": 791722806.0, + "step": 20748 + }, + { + "epoch": 2.6394860704744945, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.462623596191406, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8596630096435547, + "num_tokens": 791757726.0, + "step": 20749 + }, + { + "epoch": 2.639613280753085, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37286376953125, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8816500902175903, + "num_tokens": 791795160.0, + "step": 20750 + }, + { + "epoch": 2.6397404910316755, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.448606491088867, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8859243988990784, + "num_tokens": 791831682.0, + "step": 20751 + }, + { + "epoch": 2.6398677013102656, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41195297241211, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8845742344856262, + "num_tokens": 791863230.0, + "step": 20752 + }, + { + "epoch": 2.6399949115888566, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.168598175048828, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8861812949180603, + "num_tokens": 791904323.0, + "step": 20753 + }, + { + "epoch": 2.6401221218674467, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.641254425048828, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8667649030685425, + "num_tokens": 791951836.0, + "step": 20754 + }, + { + "epoch": 2.6402493321460376, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26556968688965, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8761454224586487, + "num_tokens": 791989782.0, + "step": 20755 + }, + { + "epoch": 2.6403765424246277, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.562381744384766, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.874947190284729, + "num_tokens": 792024038.0, + "step": 20756 + }, + { + "epoch": 2.6405037527032187, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.287723541259766, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8734976053237915, + "num_tokens": 792067893.0, + "step": 20757 + }, + { + "epoch": 2.6406309629818088, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1511287689209, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8829432725906372, + "num_tokens": 792108593.0, + "step": 20758 + }, + { + "epoch": 2.6407581732603993, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.554019927978516, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8863376379013062, + "num_tokens": 792141201.0, + "step": 20759 + }, + { + "epoch": 2.64088538353899, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.257610321044922, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8930786848068237, + "num_tokens": 792182908.0, + "step": 20760 + }, + { + "epoch": 2.6410125938175804, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.489826202392578, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8745976686477661, + "num_tokens": 792225223.0, + "step": 20761 + }, + { + "epoch": 2.641139804096171, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.356159210205078, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8573760390281677, + "num_tokens": 792258131.0, + "step": 20762 + }, + { + "epoch": 2.6412670143747614, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.499637603759766, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8800019025802612, + "num_tokens": 792291429.0, + "step": 20763 + }, + { + "epoch": 2.641394224653352, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.690370559692383, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8700133562088013, + "num_tokens": 792327777.0, + "step": 20764 + }, + { + "epoch": 2.6415214349319425, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.438514709472656, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8765620589256287, + "num_tokens": 792367918.0, + "step": 20765 + }, + { + "epoch": 2.641648645210533, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.286056518554688, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8782088756561279, + "num_tokens": 792405550.0, + "step": 20766 + }, + { + "epoch": 2.6417758554891235, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.719701766967773, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8755598664283752, + "num_tokens": 792440948.0, + "step": 20767 + }, + { + "epoch": 2.641903065767714, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.118083953857422, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.880626916885376, + "num_tokens": 792488065.0, + "step": 20768 + }, + { + "epoch": 2.6420302760463046, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.368104934692383, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8816171288490295, + "num_tokens": 792522845.0, + "step": 20769 + }, + { + "epoch": 2.642157486324895, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.350250244140625, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8842841386795044, + "num_tokens": 792561118.0, + "step": 20770 + }, + { + "epoch": 2.6422846966034856, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.223876953125, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8743101358413696, + "num_tokens": 792600125.0, + "step": 20771 + }, + { + "epoch": 2.642411906882076, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30559539794922, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8801074028015137, + "num_tokens": 792633856.0, + "step": 20772 + }, + { + "epoch": 2.6425391171606667, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36467933654785, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8712393045425415, + "num_tokens": 792672427.0, + "step": 20773 + }, + { + "epoch": 2.6426663274392572, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.323909759521484, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8534674644470215, + "num_tokens": 792713856.0, + "step": 20774 + }, + { + "epoch": 2.6427935377178478, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.242019653320312, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8674608469009399, + "num_tokens": 792757602.0, + "step": 20775 + }, + { + "epoch": 2.6429207479964383, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.547883987426758, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.865976870059967, + "num_tokens": 792797067.0, + "step": 20776 + }, + { + "epoch": 2.6430479582750284, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.48502540588379, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8784345984458923, + "num_tokens": 792835748.0, + "step": 20777 + }, + { + "epoch": 2.6431751685536193, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.462574005126953, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8754288554191589, + "num_tokens": 792875039.0, + "step": 20778 + }, + { + "epoch": 2.6433023788322094, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31193733215332, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8796969652175903, + "num_tokens": 792916580.0, + "step": 20779 + }, + { + "epoch": 2.6434295891108004, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.429210662841797, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8760606646537781, + "num_tokens": 792952507.0, + "step": 20780 + }, + { + "epoch": 2.6435567993893905, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.324390411376953, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8780282735824585, + "num_tokens": 792994046.0, + "step": 20781 + }, + { + "epoch": 2.6436840096679814, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47062110900879, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8922463655471802, + "num_tokens": 793033855.0, + "step": 20782 + }, + { + "epoch": 2.6438112199465715, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24479866027832, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8781070113182068, + "num_tokens": 793072226.0, + "step": 20783 + }, + { + "epoch": 2.643938430225162, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.541501998901367, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8781545162200928, + "num_tokens": 793110884.0, + "step": 20784 + }, + { + "epoch": 2.6440656405037526, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.337282180786133, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8573221564292908, + "num_tokens": 793151885.0, + "step": 20785 + }, + { + "epoch": 2.644192850782343, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40172004699707, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8531518578529358, + "num_tokens": 793187253.0, + "step": 20786 + }, + { + "epoch": 2.6443200610609336, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25324821472168, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8793042302131653, + "num_tokens": 793231249.0, + "step": 20787 + }, + { + "epoch": 2.644447271339524, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.198081970214844, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8867356181144714, + "num_tokens": 793269918.0, + "step": 20788 + }, + { + "epoch": 2.6445744816181147, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.475069046020508, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8847657442092896, + "num_tokens": 793306856.0, + "step": 20789 + }, + { + "epoch": 2.6447016918967052, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.402019500732422, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8661755323410034, + "num_tokens": 793347866.0, + "step": 20790 + }, + { + "epoch": 2.6448289021752958, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.365358352661133, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8729067444801331, + "num_tokens": 793391037.0, + "step": 20791 + }, + { + "epoch": 2.6449561124538863, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50202178955078, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8609994649887085, + "num_tokens": 793423560.0, + "step": 20792 + }, + { + "epoch": 2.645083322732477, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.263429641723633, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8810102343559265, + "num_tokens": 793460215.0, + "step": 20793 + }, + { + "epoch": 2.6452105330110673, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.553621292114258, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8622009754180908, + "num_tokens": 793492671.0, + "step": 20794 + }, + { + "epoch": 2.645337743289658, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.428468704223633, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8736300468444824, + "num_tokens": 793529526.0, + "step": 20795 + }, + { + "epoch": 2.6454649535682484, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.421070098876953, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8832825422286987, + "num_tokens": 793566748.0, + "step": 20796 + }, + { + "epoch": 2.645592163846839, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.288347244262695, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.888723611831665, + "num_tokens": 793600703.0, + "step": 20797 + }, + { + "epoch": 2.6457193741254295, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.338891983032227, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8739729523658752, + "num_tokens": 793639226.0, + "step": 20798 + }, + { + "epoch": 2.64584658440402, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40981674194336, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.887526273727417, + "num_tokens": 793676147.0, + "step": 20799 + }, + { + "epoch": 2.6459737946826105, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.15285301208496, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8839995861053467, + "num_tokens": 793715778.0, + "step": 20800 + }, + { + "epoch": 2.646101004961201, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.265953063964844, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.88998943567276, + "num_tokens": 793752096.0, + "step": 20801 + }, + { + "epoch": 2.646228215239791, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.425674438476562, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.870641827583313, + "num_tokens": 793789809.0, + "step": 20802 + }, + { + "epoch": 2.646355425518382, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14975929260254, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8712576627731323, + "num_tokens": 793826505.0, + "step": 20803 + }, + { + "epoch": 2.646482635796972, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.372148513793945, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8747273683547974, + "num_tokens": 793868360.0, + "step": 20804 + }, + { + "epoch": 2.646609846075563, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.24565887451172, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8775204420089722, + "num_tokens": 793906566.0, + "step": 20805 + }, + { + "epoch": 2.6467370563541532, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.442955017089844, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.872800350189209, + "num_tokens": 793947814.0, + "step": 20806 + }, + { + "epoch": 2.6468642666327438, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.494958877563477, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.87973952293396, + "num_tokens": 793987391.0, + "step": 20807 + }, + { + "epoch": 2.6469914769113343, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33142852783203, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.885757565498352, + "num_tokens": 794024045.0, + "step": 20808 + }, + { + "epoch": 2.647118687189925, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.381505966186523, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8788010478019714, + "num_tokens": 794063405.0, + "step": 20809 + }, + { + "epoch": 2.6472458974685154, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.396366119384766, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8505341410636902, + "num_tokens": 794099729.0, + "step": 20810 + }, + { + "epoch": 2.647373107747106, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.476572036743164, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8672095537185669, + "num_tokens": 794139669.0, + "step": 20811 + }, + { + "epoch": 2.6475003180256964, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34451675415039, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.876073956489563, + "num_tokens": 794176906.0, + "step": 20812 + }, + { + "epoch": 2.647627528304287, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51814079284668, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8771475553512573, + "num_tokens": 794216594.0, + "step": 20813 + }, + { + "epoch": 2.6477547385828775, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23838996887207, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8772806525230408, + "num_tokens": 794248379.0, + "step": 20814 + }, + { + "epoch": 2.647881948861468, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.371688842773438, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8518837690353394, + "num_tokens": 794293604.0, + "step": 20815 + }, + { + "epoch": 2.6480091591400585, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45136833190918, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8647617101669312, + "num_tokens": 794332084.0, + "step": 20816 + }, + { + "epoch": 2.648136369418649, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.492610931396484, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8906224966049194, + "num_tokens": 794369158.0, + "step": 20817 + }, + { + "epoch": 2.6482635796972396, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.393795013427734, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8786211013793945, + "num_tokens": 794410531.0, + "step": 20818 + }, + { + "epoch": 2.64839078997583, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51505470275879, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8744144439697266, + "num_tokens": 794457556.0, + "step": 20819 + }, + { + "epoch": 2.6485180002544206, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40840721130371, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8637701869010925, + "num_tokens": 794492737.0, + "step": 20820 + }, + { + "epoch": 2.648645210533011, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.555070877075195, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8847799301147461, + "num_tokens": 794532589.0, + "step": 20821 + }, + { + "epoch": 2.6487724208116017, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.2449893951416, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.863749086856842, + "num_tokens": 794566982.0, + "step": 20822 + }, + { + "epoch": 2.648899631090192, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32827377319336, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8575889468193054, + "num_tokens": 794604278.0, + "step": 20823 + }, + { + "epoch": 2.6490268413687827, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.327844619750977, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8839429616928101, + "num_tokens": 794643785.0, + "step": 20824 + }, + { + "epoch": 2.649154051647373, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.605724334716797, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.883708655834198, + "num_tokens": 794678302.0, + "step": 20825 + }, + { + "epoch": 2.649281261925964, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.321857452392578, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8640259504318237, + "num_tokens": 794715732.0, + "step": 20826 + }, + { + "epoch": 2.649408472204554, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4104061126709, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8789992332458496, + "num_tokens": 794754359.0, + "step": 20827 + }, + { + "epoch": 2.649535682483145, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58470344543457, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8715429306030273, + "num_tokens": 794795770.0, + "step": 20828 + }, + { + "epoch": 2.649662892761735, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.21797752380371, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8680609464645386, + "num_tokens": 794835654.0, + "step": 20829 + }, + { + "epoch": 2.649790103040326, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71022605895996, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8769308924674988, + "num_tokens": 794872527.0, + "step": 20830 + }, + { + "epoch": 2.649917313318916, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3786678314209, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8785555958747864, + "num_tokens": 794918577.0, + "step": 20831 + }, + { + "epoch": 2.6500445235975065, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30852699279785, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8844690322875977, + "num_tokens": 794953566.0, + "step": 20832 + }, + { + "epoch": 2.650171733876097, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.538898468017578, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8813589811325073, + "num_tokens": 794990547.0, + "step": 20833 + }, + { + "epoch": 2.6502989441546876, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.173992156982422, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8914746046066284, + "num_tokens": 795028419.0, + "step": 20834 + }, + { + "epoch": 2.650426154433278, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.318151473999023, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8901463747024536, + "num_tokens": 795064303.0, + "step": 20835 + }, + { + "epoch": 2.6505533647118686, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40440559387207, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8534504175186157, + "num_tokens": 795104280.0, + "step": 20836 + }, + { + "epoch": 2.650680574990459, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.463237762451172, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8657344579696655, + "num_tokens": 795142547.0, + "step": 20837 + }, + { + "epoch": 2.6508077852690497, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.240144729614258, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8727627396583557, + "num_tokens": 795186193.0, + "step": 20838 + }, + { + "epoch": 2.6509349955476402, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45550537109375, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8910020589828491, + "num_tokens": 795215453.0, + "step": 20839 + }, + { + "epoch": 2.6510622058262308, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43861198425293, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8706321716308594, + "num_tokens": 795252820.0, + "step": 20840 + }, + { + "epoch": 2.6511894161048213, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61367416381836, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8727326393127441, + "num_tokens": 795290000.0, + "step": 20841 + }, + { + "epoch": 2.651316626383412, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.206727981567383, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8925373554229736, + "num_tokens": 795330389.0, + "step": 20842 + }, + { + "epoch": 2.6514438366620023, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.46114158630371, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8731582760810852, + "num_tokens": 795366693.0, + "step": 20843 + }, + { + "epoch": 2.651571046940593, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.549312591552734, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.876505970954895, + "num_tokens": 795405748.0, + "step": 20844 + }, + { + "epoch": 2.6516982572191834, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.436683654785156, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8855006694793701, + "num_tokens": 795446179.0, + "step": 20845 + }, + { + "epoch": 2.651825467497774, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.299177169799805, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.870218813419342, + "num_tokens": 795488234.0, + "step": 20846 + }, + { + "epoch": 2.6519526777763645, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.350831985473633, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8718193173408508, + "num_tokens": 795526508.0, + "step": 20847 + }, + { + "epoch": 2.652079888054955, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.162227630615234, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.882466733455658, + "num_tokens": 795566299.0, + "step": 20848 + }, + { + "epoch": 2.6522070983335455, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.449342727661133, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8702482581138611, + "num_tokens": 795603892.0, + "step": 20849 + }, + { + "epoch": 2.6523343086121356, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.438507080078125, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8823153972625732, + "num_tokens": 795639622.0, + "step": 20850 + }, + { + "epoch": 2.6524615188907266, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35504150390625, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.896439790725708, + "num_tokens": 795677299.0, + "step": 20851 + }, + { + "epoch": 2.6525887291693167, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44280242919922, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8745474219322205, + "num_tokens": 795714304.0, + "step": 20852 + }, + { + "epoch": 2.6527159394479076, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47088623046875, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8706949353218079, + "num_tokens": 795752330.0, + "step": 20853 + }, + { + "epoch": 2.6528431497264977, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.366750717163086, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8542128801345825, + "num_tokens": 795789503.0, + "step": 20854 + }, + { + "epoch": 2.6529703600050887, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.427282333374023, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8741974234580994, + "num_tokens": 795828449.0, + "step": 20855 + }, + { + "epoch": 2.6530975702836788, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.329282760620117, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8753857016563416, + "num_tokens": 795868454.0, + "step": 20856 + }, + { + "epoch": 2.6532247805622693, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.348400115966797, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8611552715301514, + "num_tokens": 795904629.0, + "step": 20857 + }, + { + "epoch": 2.65335199084086, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22260856628418, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8610880374908447, + "num_tokens": 795943197.0, + "step": 20858 + }, + { + "epoch": 2.6534792011194503, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36418914794922, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8715897798538208, + "num_tokens": 795984672.0, + "step": 20859 + }, + { + "epoch": 2.653606411398041, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.180015563964844, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8561357855796814, + "num_tokens": 796022599.0, + "step": 20860 + }, + { + "epoch": 2.6537336216766314, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.375795364379883, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8724275827407837, + "num_tokens": 796057798.0, + "step": 20861 + }, + { + "epoch": 2.653860831955222, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.401866912841797, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.873820424079895, + "num_tokens": 796094928.0, + "step": 20862 + }, + { + "epoch": 2.6539880422338125, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45290756225586, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8768457770347595, + "num_tokens": 796132614.0, + "step": 20863 + }, + { + "epoch": 2.654115252512403, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39203643798828, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8729965090751648, + "num_tokens": 796171653.0, + "step": 20864 + }, + { + "epoch": 2.6542424627909935, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44285774230957, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8691704273223877, + "num_tokens": 796215218.0, + "step": 20865 + }, + { + "epoch": 2.654369673069584, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5169677734375, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8858755826950073, + "num_tokens": 796249165.0, + "step": 20866 + }, + { + "epoch": 2.6544968833481746, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.260889053344727, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8719228506088257, + "num_tokens": 796283878.0, + "step": 20867 + }, + { + "epoch": 2.654624093626765, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.426776885986328, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.86199951171875, + "num_tokens": 796330760.0, + "step": 20868 + }, + { + "epoch": 2.6547513039053556, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3505859375, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8640826940536499, + "num_tokens": 796371547.0, + "step": 20869 + }, + { + "epoch": 2.654878514183946, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43864631652832, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.858911395072937, + "num_tokens": 796409893.0, + "step": 20870 + }, + { + "epoch": 2.6550057244625367, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.579416275024414, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8589552640914917, + "num_tokens": 796442493.0, + "step": 20871 + }, + { + "epoch": 2.655132934741127, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.239910125732422, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8765132427215576, + "num_tokens": 796480853.0, + "step": 20872 + }, + { + "epoch": 2.6552601450197177, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.493988037109375, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8801004886627197, + "num_tokens": 796520057.0, + "step": 20873 + }, + { + "epoch": 2.6553873552983083, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.591901779174805, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8784480690956116, + "num_tokens": 796549779.0, + "step": 20874 + }, + { + "epoch": 2.6555145655768984, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53188705444336, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8800977468490601, + "num_tokens": 796590923.0, + "step": 20875 + }, + { + "epoch": 2.6556417758554893, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.437808990478516, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8693783283233643, + "num_tokens": 796629455.0, + "step": 20876 + }, + { + "epoch": 2.6557689861340794, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.536338806152344, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8711512088775635, + "num_tokens": 796669950.0, + "step": 20877 + }, + { + "epoch": 2.6558961964126704, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47275161743164, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8640698194503784, + "num_tokens": 796712263.0, + "step": 20878 + }, + { + "epoch": 2.6560234066912605, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.25556182861328, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.873144805431366, + "num_tokens": 796753025.0, + "step": 20879 + }, + { + "epoch": 2.6561506169698514, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.488788604736328, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.887262761592865, + "num_tokens": 796788810.0, + "step": 20880 + }, + { + "epoch": 2.6562778272484415, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.420860290527344, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.876990020275116, + "num_tokens": 796826275.0, + "step": 20881 + }, + { + "epoch": 2.656405037527032, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.333232879638672, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.869278073310852, + "num_tokens": 796859587.0, + "step": 20882 + }, + { + "epoch": 2.6565322478056226, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.390811920166016, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8799328804016113, + "num_tokens": 796899440.0, + "step": 20883 + }, + { + "epoch": 2.656659458084213, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.236730575561523, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8638007640838623, + "num_tokens": 796938703.0, + "step": 20884 + }, + { + "epoch": 2.6567866683628036, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.482437133789062, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.860881507396698, + "num_tokens": 796982287.0, + "step": 20885 + }, + { + "epoch": 2.656913878641394, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28728485107422, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8762367367744446, + "num_tokens": 797017640.0, + "step": 20886 + }, + { + "epoch": 2.6570410889199847, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.304786682128906, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8593918085098267, + "num_tokens": 797054050.0, + "step": 20887 + }, + { + "epoch": 2.6571682991985752, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.366025924682617, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8777083158493042, + "num_tokens": 797092429.0, + "step": 20888 + }, + { + "epoch": 2.6572955094771658, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.52936553955078, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8710578680038452, + "num_tokens": 797125646.0, + "step": 20889 + }, + { + "epoch": 2.6574227197557563, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30999183654785, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8768960237503052, + "num_tokens": 797164560.0, + "step": 20890 + }, + { + "epoch": 2.657549930034347, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.403064727783203, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8729121685028076, + "num_tokens": 797194168.0, + "step": 20891 + }, + { + "epoch": 2.6576771403129373, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34820556640625, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8927764892578125, + "num_tokens": 797227045.0, + "step": 20892 + }, + { + "epoch": 2.657804350591528, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.330007553100586, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8880527019500732, + "num_tokens": 797267607.0, + "step": 20893 + }, + { + "epoch": 2.6579315608701184, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.301475524902344, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8628846406936646, + "num_tokens": 797309320.0, + "step": 20894 + }, + { + "epoch": 2.658058771148709, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.560989379882812, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8713064193725586, + "num_tokens": 797348258.0, + "step": 20895 + }, + { + "epoch": 2.6581859814272994, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.160205841064453, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.874788224697113, + "num_tokens": 797391563.0, + "step": 20896 + }, + { + "epoch": 2.65831319170589, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.551668167114258, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8826912641525269, + "num_tokens": 797428314.0, + "step": 20897 + }, + { + "epoch": 2.6584404019844805, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.401813507080078, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8541517853736877, + "num_tokens": 797464480.0, + "step": 20898 + }, + { + "epoch": 2.658567612263071, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.408784866333008, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8750215172767639, + "num_tokens": 797504135.0, + "step": 20899 + }, + { + "epoch": 2.658694822541661, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.479007720947266, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.876304030418396, + "num_tokens": 797544222.0, + "step": 20900 + }, + { + "epoch": 2.658822032820252, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37184715270996, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8802956342697144, + "num_tokens": 797583695.0, + "step": 20901 + }, + { + "epoch": 2.658949243098842, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4103946685791, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8604345917701721, + "num_tokens": 797625660.0, + "step": 20902 + }, + { + "epoch": 2.659076453377433, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.469085693359375, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8723340034484863, + "num_tokens": 797665807.0, + "step": 20903 + }, + { + "epoch": 2.6592036636560232, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.326595306396484, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8584619164466858, + "num_tokens": 797707935.0, + "step": 20904 + }, + { + "epoch": 2.6593308739346138, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.398630142211914, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8678613901138306, + "num_tokens": 797745502.0, + "step": 20905 + }, + { + "epoch": 2.6594580842132043, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40022087097168, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8715213537216187, + "num_tokens": 797779412.0, + "step": 20906 + }, + { + "epoch": 2.659585294491795, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33965492248535, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8760733604431152, + "num_tokens": 797816677.0, + "step": 20907 + }, + { + "epoch": 2.6597125047703853, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.508220672607422, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8566346764564514, + "num_tokens": 797854260.0, + "step": 20908 + }, + { + "epoch": 2.659839715048976, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3837890625, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8614974617958069, + "num_tokens": 797892349.0, + "step": 20909 + }, + { + "epoch": 2.6599669253275664, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3145809173584, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8777700662612915, + "num_tokens": 797928028.0, + "step": 20910 + }, + { + "epoch": 2.660094135606157, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.453262329101562, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8594147562980652, + "num_tokens": 797963885.0, + "step": 20911 + }, + { + "epoch": 2.6602213458847475, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18648910522461, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8612434267997742, + "num_tokens": 798009842.0, + "step": 20912 + }, + { + "epoch": 2.660348556163338, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.284204483032227, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.886853039264679, + "num_tokens": 798043207.0, + "step": 20913 + }, + { + "epoch": 2.6604757664419285, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32987403869629, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8891571760177612, + "num_tokens": 798080887.0, + "step": 20914 + }, + { + "epoch": 2.660602976720519, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23463249206543, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8837193846702576, + "num_tokens": 798126393.0, + "step": 20915 + }, + { + "epoch": 2.6607301869991096, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35721206665039, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8890482783317566, + "num_tokens": 798166099.0, + "step": 20916 + }, + { + "epoch": 2.6608573972777, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.283935546875, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8674615621566772, + "num_tokens": 798202285.0, + "step": 20917 + }, + { + "epoch": 2.6609846075562906, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.384782791137695, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8812159299850464, + "num_tokens": 798243463.0, + "step": 20918 + }, + { + "epoch": 2.661111817834881, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.320697784423828, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8757842183113098, + "num_tokens": 798275507.0, + "step": 20919 + }, + { + "epoch": 2.6612390281134717, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40561294555664, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8920496702194214, + "num_tokens": 798310536.0, + "step": 20920 + }, + { + "epoch": 2.661366238392062, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.592391967773438, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8689603209495544, + "num_tokens": 798347874.0, + "step": 20921 + }, + { + "epoch": 2.6614934486706527, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.48578453063965, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8976112604141235, + "num_tokens": 798381865.0, + "step": 20922 + }, + { + "epoch": 2.661620658949243, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27960777282715, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.872675895690918, + "num_tokens": 798423378.0, + "step": 20923 + }, + { + "epoch": 2.661747869227834, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3778076171875, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8811856508255005, + "num_tokens": 798460278.0, + "step": 20924 + }, + { + "epoch": 2.661875079506424, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41298484802246, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8745306730270386, + "num_tokens": 798496622.0, + "step": 20925 + }, + { + "epoch": 2.662002289785015, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.647449493408203, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8700709342956543, + "num_tokens": 798533061.0, + "step": 20926 + }, + { + "epoch": 2.662129500063605, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.276212692260742, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8669095039367676, + "num_tokens": 798573689.0, + "step": 20927 + }, + { + "epoch": 2.662256710342196, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.443038940429688, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8828259706497192, + "num_tokens": 798614058.0, + "step": 20928 + }, + { + "epoch": 2.662383920620786, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.325117111206055, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8635732531547546, + "num_tokens": 798657769.0, + "step": 20929 + }, + { + "epoch": 2.6625111308993765, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.277986526489258, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8772941827774048, + "num_tokens": 798701084.0, + "step": 20930 + }, + { + "epoch": 2.662638341177967, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35810089111328, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8789516687393188, + "num_tokens": 798738029.0, + "step": 20931 + }, + { + "epoch": 2.6627655514565576, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.344392776489258, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8699468374252319, + "num_tokens": 798773537.0, + "step": 20932 + }, + { + "epoch": 2.662892761735148, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.527124404907227, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8719038963317871, + "num_tokens": 798810397.0, + "step": 20933 + }, + { + "epoch": 2.6630199720137386, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.18767547607422, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8695014715194702, + "num_tokens": 798850796.0, + "step": 20934 + }, + { + "epoch": 2.663147182292329, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.320552825927734, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8730996251106262, + "num_tokens": 798892379.0, + "step": 20935 + }, + { + "epoch": 2.6632743925709197, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37409210205078, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.862401008605957, + "num_tokens": 798929444.0, + "step": 20936 + }, + { + "epoch": 2.66340160284951, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.259687423706055, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8802058696746826, + "num_tokens": 798964086.0, + "step": 20937 + }, + { + "epoch": 2.6635288131281007, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38404655456543, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8821496367454529, + "num_tokens": 799007408.0, + "step": 20938 + }, + { + "epoch": 2.6636560234066913, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42070770263672, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8782808780670166, + "num_tokens": 799045930.0, + "step": 20939 + }, + { + "epoch": 2.663783233685282, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.524150848388672, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8839719891548157, + "num_tokens": 799083568.0, + "step": 20940 + }, + { + "epoch": 2.6639104439638723, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27193832397461, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8710841536521912, + "num_tokens": 799123681.0, + "step": 20941 + }, + { + "epoch": 2.664037654242463, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.432945251464844, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8701225519180298, + "num_tokens": 799164623.0, + "step": 20942 + }, + { + "epoch": 2.6641648645210534, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.397174835205078, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.873189389705658, + "num_tokens": 799200408.0, + "step": 20943 + }, + { + "epoch": 2.664292074799644, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.363676071166992, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8628000617027283, + "num_tokens": 799245106.0, + "step": 20944 + }, + { + "epoch": 2.6644192850782344, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41969871520996, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8810484409332275, + "num_tokens": 799290429.0, + "step": 20945 + }, + { + "epoch": 2.664546495356825, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.390043258666992, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8714233636856079, + "num_tokens": 799322951.0, + "step": 20946 + }, + { + "epoch": 2.6646737056354155, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.430814743041992, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8699316382408142, + "num_tokens": 799365498.0, + "step": 20947 + }, + { + "epoch": 2.6648009159140056, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.352127075195312, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8683106899261475, + "num_tokens": 799405894.0, + "step": 20948 + }, + { + "epoch": 2.6649281261925966, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44470977783203, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8730998635292053, + "num_tokens": 799439750.0, + "step": 20949 + }, + { + "epoch": 2.6650553364711866, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.603118896484375, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8663861155509949, + "num_tokens": 799475170.0, + "step": 20950 + }, + { + "epoch": 2.6651825467497776, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.344680786132812, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8606107234954834, + "num_tokens": 799514231.0, + "step": 20951 + }, + { + "epoch": 2.6653097570283677, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.521350860595703, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8748538494110107, + "num_tokens": 799549512.0, + "step": 20952 + }, + { + "epoch": 2.6654369673069587, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42422103881836, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8692280054092407, + "num_tokens": 799592144.0, + "step": 20953 + }, + { + "epoch": 2.6655641775855488, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4040584564209, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8588091731071472, + "num_tokens": 799634779.0, + "step": 20954 + }, + { + "epoch": 2.6656913878641393, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.380300521850586, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8849942684173584, + "num_tokens": 799677548.0, + "step": 20955 + }, + { + "epoch": 2.66581859814273, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.238037109375, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.856718897819519, + "num_tokens": 799715679.0, + "step": 20956 + }, + { + "epoch": 2.6659458084213203, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.27420425415039, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8654272556304932, + "num_tokens": 799755701.0, + "step": 20957 + }, + { + "epoch": 2.666073018699911, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3712100982666, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8686676025390625, + "num_tokens": 799795151.0, + "step": 20958 + }, + { + "epoch": 2.6662002289785014, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.429481506347656, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8797292709350586, + "num_tokens": 799832249.0, + "step": 20959 + }, + { + "epoch": 2.666327439257092, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.373685836791992, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8537749648094177, + "num_tokens": 799867762.0, + "step": 20960 + }, + { + "epoch": 2.6664546495356825, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36750030517578, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8700224161148071, + "num_tokens": 799909012.0, + "step": 20961 + }, + { + "epoch": 2.666581859814273, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.421159744262695, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8698331713676453, + "num_tokens": 799950316.0, + "step": 20962 + }, + { + "epoch": 2.6667090700928635, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.580387115478516, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8880568146705627, + "num_tokens": 799990037.0, + "step": 20963 + }, + { + "epoch": 2.666836280371454, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22532844543457, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8827791213989258, + "num_tokens": 800023935.0, + "step": 20964 + }, + { + "epoch": 2.6669634906500446, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.369590759277344, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.879304051399231, + "num_tokens": 800066357.0, + "step": 20965 + }, + { + "epoch": 2.667090700928635, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42555809020996, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8598313927650452, + "num_tokens": 800100667.0, + "step": 20966 + }, + { + "epoch": 2.6672179112072256, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.19849395751953, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.87730473279953, + "num_tokens": 800135101.0, + "step": 20967 + }, + { + "epoch": 2.667345121485816, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.219270706176758, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8881130218505859, + "num_tokens": 800176308.0, + "step": 20968 + }, + { + "epoch": 2.6674723317644067, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.537893295288086, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8843034505844116, + "num_tokens": 800205585.0, + "step": 20969 + }, + { + "epoch": 2.667599542042997, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.402196884155273, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8733012080192566, + "num_tokens": 800241400.0, + "step": 20970 + }, + { + "epoch": 2.6677267523215877, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.297208786010742, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8841944336891174, + "num_tokens": 800279060.0, + "step": 20971 + }, + { + "epoch": 2.6678539626001783, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.462217330932617, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8826804757118225, + "num_tokens": 800318606.0, + "step": 20972 + }, + { + "epoch": 2.6679811728787683, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.405771255493164, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8727414011955261, + "num_tokens": 800356795.0, + "step": 20973 + }, + { + "epoch": 2.6681083831573593, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.262319564819336, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8810379505157471, + "num_tokens": 800393786.0, + "step": 20974 + }, + { + "epoch": 2.6682355934359494, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.680191040039062, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8719650506973267, + "num_tokens": 800440153.0, + "step": 20975 + }, + { + "epoch": 2.6683628037145404, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.246492385864258, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8557315468788147, + "num_tokens": 800476343.0, + "step": 20976 + }, + { + "epoch": 2.6684900139931305, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.425451278686523, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8856624960899353, + "num_tokens": 800514227.0, + "step": 20977 + }, + { + "epoch": 2.668617224271721, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47579002380371, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8727193474769592, + "num_tokens": 800552357.0, + "step": 20978 + }, + { + "epoch": 2.6687444345503115, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50737190246582, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8816776275634766, + "num_tokens": 800587624.0, + "step": 20979 + }, + { + "epoch": 2.668871644828902, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.22928237915039, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8677922487258911, + "num_tokens": 800631183.0, + "step": 20980 + }, + { + "epoch": 2.6689988551074926, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.492095947265625, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.877765417098999, + "num_tokens": 800670334.0, + "step": 20981 + }, + { + "epoch": 2.669126065386083, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.596574783325195, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8695521354675293, + "num_tokens": 800703500.0, + "step": 20982 + }, + { + "epoch": 2.6692532756646736, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.241722106933594, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8674482107162476, + "num_tokens": 800742904.0, + "step": 20983 + }, + { + "epoch": 2.669380485943264, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.457866668701172, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8704720139503479, + "num_tokens": 800777556.0, + "step": 20984 + }, + { + "epoch": 2.6695076962218547, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38705062866211, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8648937940597534, + "num_tokens": 800817532.0, + "step": 20985 + }, + { + "epoch": 2.669634906500445, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.408681869506836, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8832323551177979, + "num_tokens": 800856567.0, + "step": 20986 + }, + { + "epoch": 2.6697621167790357, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.43642807006836, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8511886596679688, + "num_tokens": 800901234.0, + "step": 20987 + }, + { + "epoch": 2.6698893270576263, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.336509704589844, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8709913492202759, + "num_tokens": 800942647.0, + "step": 20988 + }, + { + "epoch": 2.670016537336217, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45612907409668, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.880679726600647, + "num_tokens": 800980194.0, + "step": 20989 + }, + { + "epoch": 2.6701437476148073, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.356298446655273, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.877826988697052, + "num_tokens": 801016978.0, + "step": 20990 + }, + { + "epoch": 2.670270957893398, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.301916122436523, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8598180413246155, + "num_tokens": 801051921.0, + "step": 20991 + }, + { + "epoch": 2.6703981681719884, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.634105682373047, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8478693962097168, + "num_tokens": 801087189.0, + "step": 20992 + }, + { + "epoch": 2.670525378450579, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.36722755432129, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8787481188774109, + "num_tokens": 801121574.0, + "step": 20993 + }, + { + "epoch": 2.6706525887291694, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.434288024902344, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.879141092300415, + "num_tokens": 801160497.0, + "step": 20994 + }, + { + "epoch": 2.67077979900776, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.331838607788086, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8745819926261902, + "num_tokens": 801198556.0, + "step": 20995 + }, + { + "epoch": 2.6709070092863505, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.478160858154297, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8976798057556152, + "num_tokens": 801230616.0, + "step": 20996 + }, + { + "epoch": 2.671034219564941, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34979820251465, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8715664148330688, + "num_tokens": 801266582.0, + "step": 20997 + }, + { + "epoch": 2.671161429843531, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.358110427856445, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8850198984146118, + "num_tokens": 801305127.0, + "step": 20998 + }, + { + "epoch": 2.671288640122122, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.14394187927246, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8778842687606812, + "num_tokens": 801348284.0, + "step": 20999 + }, + { + "epoch": 2.671415850400712, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.240358352661133, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8928880095481873, + "num_tokens": 801385298.0, + "step": 21000 + }, + { + "epoch": 2.671543060679303, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.300016403198242, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8655772805213928, + "num_tokens": 801417020.0, + "step": 21001 + }, + { + "epoch": 2.6716702709578932, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.440160751342773, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8659018278121948, + "num_tokens": 801455328.0, + "step": 21002 + }, + { + "epoch": 2.6717974812364838, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.514081954956055, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8847548961639404, + "num_tokens": 801494342.0, + "step": 21003 + }, + { + "epoch": 2.6719246915150743, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.389116287231445, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8760849237442017, + "num_tokens": 801528182.0, + "step": 21004 + }, + { + "epoch": 2.672051901793665, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4852294921875, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8742993474006653, + "num_tokens": 801569424.0, + "step": 21005 + }, + { + "epoch": 2.6721791120722553, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.497947692871094, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.870995283126831, + "num_tokens": 801608809.0, + "step": 21006 + }, + { + "epoch": 2.672306322350846, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.441516876220703, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8655890226364136, + "num_tokens": 801651910.0, + "step": 21007 + }, + { + "epoch": 2.6724335326294364, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.487300872802734, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8723864555358887, + "num_tokens": 801692961.0, + "step": 21008 + }, + { + "epoch": 2.672560742908027, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.422840118408203, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8616987466812134, + "num_tokens": 801731278.0, + "step": 21009 + }, + { + "epoch": 2.6726879531866174, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.404884338378906, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.869876503944397, + "num_tokens": 801769562.0, + "step": 21010 + }, + { + "epoch": 2.672815163465208, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51893424987793, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8691914677619934, + "num_tokens": 801811046.0, + "step": 21011 + }, + { + "epoch": 2.6729423737437985, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.328123092651367, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8503706455230713, + "num_tokens": 801847833.0, + "step": 21012 + }, + { + "epoch": 2.673069584022389, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.437719345092773, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8684805035591125, + "num_tokens": 801887819.0, + "step": 21013 + }, + { + "epoch": 2.6731967943009796, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.592693328857422, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8847498893737793, + "num_tokens": 801922535.0, + "step": 21014 + }, + { + "epoch": 2.67332400457957, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.288724899291992, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8787740468978882, + "num_tokens": 801958025.0, + "step": 21015 + }, + { + "epoch": 2.6734512148581606, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.513246536254883, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8781684637069702, + "num_tokens": 801991291.0, + "step": 21016 + }, + { + "epoch": 2.673578425136751, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30843162536621, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8747413158416748, + "num_tokens": 802024952.0, + "step": 21017 + }, + { + "epoch": 2.6737056354153417, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.486961364746094, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8729418516159058, + "num_tokens": 802065741.0, + "step": 21018 + }, + { + "epoch": 2.673832845693932, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.253650665283203, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8730806708335876, + "num_tokens": 802107153.0, + "step": 21019 + }, + { + "epoch": 2.6739600559725227, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.259748458862305, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8787446022033691, + "num_tokens": 802148177.0, + "step": 21020 + }, + { + "epoch": 2.674087266251113, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.134952545166016, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8756089210510254, + "num_tokens": 802186144.0, + "step": 21021 + }, + { + "epoch": 2.674214476529704, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.34128189086914, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8864632844924927, + "num_tokens": 802224054.0, + "step": 21022 + }, + { + "epoch": 2.674341686808294, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.526866912841797, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8571362495422363, + "num_tokens": 802264483.0, + "step": 21023 + }, + { + "epoch": 2.674468897086885, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.351116180419922, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8813077211380005, + "num_tokens": 802302933.0, + "step": 21024 + }, + { + "epoch": 2.674596107365475, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33167839050293, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8663594722747803, + "num_tokens": 802347354.0, + "step": 21025 + }, + { + "epoch": 2.674723317644066, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5633602142334, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8829532861709595, + "num_tokens": 802392159.0, + "step": 21026 + }, + { + "epoch": 2.674850527922656, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.213966369628906, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8773660659790039, + "num_tokens": 802432629.0, + "step": 21027 + }, + { + "epoch": 2.6749777382012465, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.172868728637695, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.882253885269165, + "num_tokens": 802471776.0, + "step": 21028 + }, + { + "epoch": 2.675104948479837, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.508296966552734, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8675740957260132, + "num_tokens": 802507314.0, + "step": 21029 + }, + { + "epoch": 2.6752321587584276, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.240558624267578, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8813974261283875, + "num_tokens": 802550587.0, + "step": 21030 + }, + { + "epoch": 2.675359369037018, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.305688858032227, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8747565746307373, + "num_tokens": 802591544.0, + "step": 21031 + }, + { + "epoch": 2.6754865793156086, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.512109756469727, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8814180493354797, + "num_tokens": 802627336.0, + "step": 21032 + }, + { + "epoch": 2.675613789594199, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29218292236328, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8724108934402466, + "num_tokens": 802668358.0, + "step": 21033 + }, + { + "epoch": 2.6757409998727897, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.46500587463379, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8816896080970764, + "num_tokens": 802708023.0, + "step": 21034 + }, + { + "epoch": 2.67586821015138, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.1708984375, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8940731287002563, + "num_tokens": 802745856.0, + "step": 21035 + }, + { + "epoch": 2.6759954204299707, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4040470123291, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8860368728637695, + "num_tokens": 802779032.0, + "step": 21036 + }, + { + "epoch": 2.6761226307085613, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.216943740844727, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8745354413986206, + "num_tokens": 802815736.0, + "step": 21037 + }, + { + "epoch": 2.676249840987152, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.279577255249023, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8702894449234009, + "num_tokens": 802854133.0, + "step": 21038 + }, + { + "epoch": 2.6763770512657423, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.452165603637695, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8812308311462402, + "num_tokens": 802892969.0, + "step": 21039 + }, + { + "epoch": 2.676504261544333, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.205808639526367, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8884016275405884, + "num_tokens": 802936998.0, + "step": 21040 + }, + { + "epoch": 2.6766314718229234, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.32378387451172, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.882620632648468, + "num_tokens": 802979040.0, + "step": 21041 + }, + { + "epoch": 2.676758682101514, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.375, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8845857381820679, + "num_tokens": 803016516.0, + "step": 21042 + }, + { + "epoch": 2.6768858923801044, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.334346771240234, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8722854256629944, + "num_tokens": 803055566.0, + "step": 21043 + }, + { + "epoch": 2.677013102658695, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.46743392944336, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8759385347366333, + "num_tokens": 803093683.0, + "step": 21044 + }, + { + "epoch": 2.6771403129372855, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.248891830444336, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8838395476341248, + "num_tokens": 803133691.0, + "step": 21045 + }, + { + "epoch": 2.6772675232158756, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.413379669189453, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8829447031021118, + "num_tokens": 803169870.0, + "step": 21046 + }, + { + "epoch": 2.6773947334944665, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.775493621826172, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8674944043159485, + "num_tokens": 803209038.0, + "step": 21047 + }, + { + "epoch": 2.6775219437730566, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40111541748047, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8604990243911743, + "num_tokens": 803239725.0, + "step": 21048 + }, + { + "epoch": 2.6776491540516476, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5295352935791, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8719794154167175, + "num_tokens": 803273580.0, + "step": 21049 + }, + { + "epoch": 2.6777763643302377, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.442697525024414, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.879995584487915, + "num_tokens": 803313583.0, + "step": 21050 + }, + { + "epoch": 2.6779035746088287, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.26563835144043, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8825028538703918, + "num_tokens": 803347517.0, + "step": 21051 + }, + { + "epoch": 2.6780307848874187, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.604766845703125, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8607842922210693, + "num_tokens": 803388183.0, + "step": 21052 + }, + { + "epoch": 2.6781579951660093, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.521074295043945, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.874305784702301, + "num_tokens": 803428377.0, + "step": 21053 + }, + { + "epoch": 2.6782852054446, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.393823623657227, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8830512762069702, + "num_tokens": 803460930.0, + "step": 21054 + }, + { + "epoch": 2.6784124157231903, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.456161499023438, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8824794888496399, + "num_tokens": 803497971.0, + "step": 21055 + }, + { + "epoch": 2.678539626001781, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.633554458618164, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8904176950454712, + "num_tokens": 803535514.0, + "step": 21056 + }, + { + "epoch": 2.6786668362803714, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.566713333129883, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8705557584762573, + "num_tokens": 803580767.0, + "step": 21057 + }, + { + "epoch": 2.678794046558962, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40599822998047, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.878416121006012, + "num_tokens": 803619168.0, + "step": 21058 + }, + { + "epoch": 2.6789212568375524, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.419025421142578, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8615660071372986, + "num_tokens": 803654785.0, + "step": 21059 + }, + { + "epoch": 2.679048467116143, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63167953491211, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8826742172241211, + "num_tokens": 803692505.0, + "step": 21060 + }, + { + "epoch": 2.6791756773947335, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.318134307861328, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8820943832397461, + "num_tokens": 803732592.0, + "step": 21061 + }, + { + "epoch": 2.679302887673324, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5140380859375, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8630547523498535, + "num_tokens": 803767601.0, + "step": 21062 + }, + { + "epoch": 2.6794300979519146, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.482759475708008, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.867888331413269, + "num_tokens": 803804329.0, + "step": 21063 + }, + { + "epoch": 2.679557308230505, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.533275604248047, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8699753284454346, + "num_tokens": 803846182.0, + "step": 21064 + }, + { + "epoch": 2.6796845185090956, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.535188674926758, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8704251646995544, + "num_tokens": 803883561.0, + "step": 21065 + }, + { + "epoch": 2.679811728787686, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.358102798461914, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8930176496505737, + "num_tokens": 803916860.0, + "step": 21066 + }, + { + "epoch": 2.6799389390662767, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.777305603027344, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8747792840003967, + "num_tokens": 803951949.0, + "step": 21067 + }, + { + "epoch": 2.680066149344867, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.328819274902344, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8703292608261108, + "num_tokens": 803992094.0, + "step": 21068 + }, + { + "epoch": 2.6801933596234577, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38367462158203, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.874711811542511, + "num_tokens": 804034058.0, + "step": 21069 + }, + { + "epoch": 2.6803205699020483, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.391258239746094, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8666568994522095, + "num_tokens": 804072722.0, + "step": 21070 + }, + { + "epoch": 2.6804477801806383, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.799654006958008, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8808050155639648, + "num_tokens": 804114253.0, + "step": 21071 + }, + { + "epoch": 2.6805749904592293, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.483436584472656, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8769140839576721, + "num_tokens": 804154877.0, + "step": 21072 + }, + { + "epoch": 2.6807022007378194, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.391237258911133, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8692266941070557, + "num_tokens": 804191060.0, + "step": 21073 + }, + { + "epoch": 2.6808294110164104, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.585294723510742, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8672747611999512, + "num_tokens": 804231574.0, + "step": 21074 + }, + { + "epoch": 2.6809566212950005, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57794189453125, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8527708053588867, + "num_tokens": 804264027.0, + "step": 21075 + }, + { + "epoch": 2.681083831573591, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29998207092285, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8761482834815979, + "num_tokens": 804302906.0, + "step": 21076 + }, + { + "epoch": 2.6812110418521815, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.404809951782227, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8705452084541321, + "num_tokens": 804339107.0, + "step": 21077 + }, + { + "epoch": 2.681338252130772, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503742218017578, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8800061345100403, + "num_tokens": 804374292.0, + "step": 21078 + }, + { + "epoch": 2.6814654624093626, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.412818908691406, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8764892816543579, + "num_tokens": 804420287.0, + "step": 21079 + }, + { + "epoch": 2.681592672687953, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.290712356567383, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8803021311759949, + "num_tokens": 804453977.0, + "step": 21080 + }, + { + "epoch": 2.6817198829665436, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45341682434082, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8591508269309998, + "num_tokens": 804494958.0, + "step": 21081 + }, + { + "epoch": 2.681847093245134, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3914794921875, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8548210859298706, + "num_tokens": 804532209.0, + "step": 21082 + }, + { + "epoch": 2.6819743035237247, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29386329650879, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8764225244522095, + "num_tokens": 804572937.0, + "step": 21083 + }, + { + "epoch": 2.682101513802315, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.430679321289062, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8743124604225159, + "num_tokens": 804610263.0, + "step": 21084 + }, + { + "epoch": 2.6822287240809057, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.505298614501953, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8783548474311829, + "num_tokens": 804645516.0, + "step": 21085 + }, + { + "epoch": 2.6823559343594963, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.510366439819336, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8854780197143555, + "num_tokens": 804684540.0, + "step": 21086 + }, + { + "epoch": 2.682483144638087, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.285106658935547, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8756449222564697, + "num_tokens": 804723030.0, + "step": 21087 + }, + { + "epoch": 2.6826103549166773, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.487464904785156, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8830740451812744, + "num_tokens": 804759202.0, + "step": 21088 + }, + { + "epoch": 2.682737565195268, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.604997634887695, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8711828589439392, + "num_tokens": 804803199.0, + "step": 21089 + }, + { + "epoch": 2.6828647754738584, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.473718643188477, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8724112510681152, + "num_tokens": 804835675.0, + "step": 21090 + }, + { + "epoch": 2.682991985752449, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.668529510498047, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8609887957572937, + "num_tokens": 804876583.0, + "step": 21091 + }, + { + "epoch": 2.6831191960310394, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.599252700805664, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8655751943588257, + "num_tokens": 804906110.0, + "step": 21092 + }, + { + "epoch": 2.68324640630963, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39403533935547, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8647259473800659, + "num_tokens": 804942196.0, + "step": 21093 + }, + { + "epoch": 2.6833736165882205, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.697093963623047, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8738393783569336, + "num_tokens": 804978760.0, + "step": 21094 + }, + { + "epoch": 2.683500826866811, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.782732009887695, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8660844564437866, + "num_tokens": 805014212.0, + "step": 21095 + }, + { + "epoch": 2.683628037145401, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.490644454956055, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8967258334159851, + "num_tokens": 805050528.0, + "step": 21096 + }, + { + "epoch": 2.683755247423992, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.792247772216797, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8745806217193604, + "num_tokens": 805087355.0, + "step": 21097 + }, + { + "epoch": 2.683882457702582, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.838600158691406, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8603262901306152, + "num_tokens": 805117837.0, + "step": 21098 + }, + { + "epoch": 2.684009667981173, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.452411651611328, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8741258978843689, + "num_tokens": 805157734.0, + "step": 21099 + }, + { + "epoch": 2.684136878259763, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64383888244629, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8804072737693787, + "num_tokens": 805190657.0, + "step": 21100 + }, + { + "epoch": 2.6842640885383537, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.570348739624023, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.867451548576355, + "num_tokens": 805229654.0, + "step": 21101 + }, + { + "epoch": 2.6843912988169443, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.413707733154297, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8742626309394836, + "num_tokens": 805269199.0, + "step": 21102 + }, + { + "epoch": 2.684518509095535, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.653308868408203, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8691918849945068, + "num_tokens": 805308918.0, + "step": 21103 + }, + { + "epoch": 2.6846457193741253, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.647253036499023, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8703786134719849, + "num_tokens": 805345014.0, + "step": 21104 + }, + { + "epoch": 2.684772929652716, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.286808013916016, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.887192964553833, + "num_tokens": 805383674.0, + "step": 21105 + }, + { + "epoch": 2.6849001399313064, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41863441467285, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8785915374755859, + "num_tokens": 805419765.0, + "step": 21106 + }, + { + "epoch": 2.685027350209897, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50010871887207, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8751215934753418, + "num_tokens": 805461346.0, + "step": 21107 + }, + { + "epoch": 2.6851545604884874, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.391036987304688, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8818973302841187, + "num_tokens": 805494519.0, + "step": 21108 + }, + { + "epoch": 2.685281770767078, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55473518371582, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8642945289611816, + "num_tokens": 805530542.0, + "step": 21109 + }, + { + "epoch": 2.6854089810456685, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66221809387207, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8736608028411865, + "num_tokens": 805564531.0, + "step": 21110 + }, + { + "epoch": 2.685536191324259, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61962127685547, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8713645935058594, + "num_tokens": 805606011.0, + "step": 21111 + }, + { + "epoch": 2.6856634016028496, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.327932357788086, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8681321144104004, + "num_tokens": 805643078.0, + "step": 21112 + }, + { + "epoch": 2.68579061188144, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59111213684082, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8765119910240173, + "num_tokens": 805678554.0, + "step": 21113 + }, + { + "epoch": 2.6859178221600306, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.459945678710938, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8834925293922424, + "num_tokens": 805713736.0, + "step": 21114 + }, + { + "epoch": 2.686045032438621, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.320449829101562, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8683677911758423, + "num_tokens": 805751798.0, + "step": 21115 + }, + { + "epoch": 2.6861722427172117, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.615346908569336, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8806442022323608, + "num_tokens": 805789205.0, + "step": 21116 + }, + { + "epoch": 2.686299452995802, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.681636810302734, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8771648406982422, + "num_tokens": 805824910.0, + "step": 21117 + }, + { + "epoch": 2.6864266632743927, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44280433654785, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8720337748527527, + "num_tokens": 805861335.0, + "step": 21118 + }, + { + "epoch": 2.686553873552983, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.35044288635254, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8706034421920776, + "num_tokens": 805901488.0, + "step": 21119 + }, + { + "epoch": 2.686681083831574, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.507164001464844, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8373361825942993, + "num_tokens": 805941336.0, + "step": 21120 + }, + { + "epoch": 2.686808294110164, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38239860534668, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8823529481887817, + "num_tokens": 805974222.0, + "step": 21121 + }, + { + "epoch": 2.686935504388755, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.592388153076172, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8696000576019287, + "num_tokens": 806014942.0, + "step": 21122 + }, + { + "epoch": 2.687062714667345, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.68561363220215, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8838962316513062, + "num_tokens": 806048894.0, + "step": 21123 + }, + { + "epoch": 2.687189924945936, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.556840896606445, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8722545504570007, + "num_tokens": 806089714.0, + "step": 21124 + }, + { + "epoch": 2.687317135224526, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.679874420166016, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8618746995925903, + "num_tokens": 806128974.0, + "step": 21125 + }, + { + "epoch": 2.6874443455031165, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.316919326782227, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.872644305229187, + "num_tokens": 806169857.0, + "step": 21126 + }, + { + "epoch": 2.687571555781707, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.449155807495117, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8746626377105713, + "num_tokens": 806202826.0, + "step": 21127 + }, + { + "epoch": 2.6876987660602976, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.458295822143555, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8728691339492798, + "num_tokens": 806242850.0, + "step": 21128 + }, + { + "epoch": 2.687825976338888, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.790788650512695, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8748387098312378, + "num_tokens": 806279024.0, + "step": 21129 + }, + { + "epoch": 2.6879531866174786, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.571285247802734, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.859193742275238, + "num_tokens": 806313669.0, + "step": 21130 + }, + { + "epoch": 2.688080396896069, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.211149215698242, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8764562010765076, + "num_tokens": 806353128.0, + "step": 21131 + }, + { + "epoch": 2.6882076071746597, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.673974990844727, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8753620386123657, + "num_tokens": 806385044.0, + "step": 21132 + }, + { + "epoch": 2.68833481745325, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.76620101928711, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8661879897117615, + "num_tokens": 806422157.0, + "step": 21133 + }, + { + "epoch": 2.6884620277318407, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.290855407714844, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8727589845657349, + "num_tokens": 806463180.0, + "step": 21134 + }, + { + "epoch": 2.6885892380104313, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51986312866211, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8695243000984192, + "num_tokens": 806502031.0, + "step": 21135 + }, + { + "epoch": 2.688716448289022, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.856826782226562, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8758355379104614, + "num_tokens": 806539585.0, + "step": 21136 + }, + { + "epoch": 2.6888436585676123, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.681602478027344, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8936695456504822, + "num_tokens": 806573147.0, + "step": 21137 + }, + { + "epoch": 2.688970868846203, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.338542938232422, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8862684369087219, + "num_tokens": 806609785.0, + "step": 21138 + }, + { + "epoch": 2.6890980791247934, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.486370086669922, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8813595771789551, + "num_tokens": 806647709.0, + "step": 21139 + }, + { + "epoch": 2.689225289403384, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.572961807250977, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8779745101928711, + "num_tokens": 806684448.0, + "step": 21140 + }, + { + "epoch": 2.6893524996819744, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.437332153320312, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8775895833969116, + "num_tokens": 806725326.0, + "step": 21141 + }, + { + "epoch": 2.689479709960565, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.270301818847656, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8743489980697632, + "num_tokens": 806759797.0, + "step": 21142 + }, + { + "epoch": 2.6896069202391555, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.49172592163086, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8814201354980469, + "num_tokens": 806799986.0, + "step": 21143 + }, + { + "epoch": 2.6897341305177456, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45242691040039, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8685746788978577, + "num_tokens": 806843055.0, + "step": 21144 + }, + { + "epoch": 2.6898613407963365, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84455680847168, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8878861665725708, + "num_tokens": 806875897.0, + "step": 21145 + }, + { + "epoch": 2.6899885510749266, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.570781707763672, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.882125973701477, + "num_tokens": 806915583.0, + "step": 21146 + }, + { + "epoch": 2.6901157613535176, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.420928955078125, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.877025842666626, + "num_tokens": 806953851.0, + "step": 21147 + }, + { + "epoch": 2.6902429716321077, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63872718811035, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8708158135414124, + "num_tokens": 806988519.0, + "step": 21148 + }, + { + "epoch": 2.6903701819106987, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.421890258789062, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8775264620780945, + "num_tokens": 807029612.0, + "step": 21149 + }, + { + "epoch": 2.6904973921892887, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.316070556640625, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8588513135910034, + "num_tokens": 807064009.0, + "step": 21150 + }, + { + "epoch": 2.6906246024678793, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.529945373535156, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8712457418441772, + "num_tokens": 807104694.0, + "step": 21151 + }, + { + "epoch": 2.69075181274647, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56122398376465, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8526062965393066, + "num_tokens": 807142500.0, + "step": 21152 + }, + { + "epoch": 2.6908790230250603, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89990997314453, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8712694644927979, + "num_tokens": 807172123.0, + "step": 21153 + }, + { + "epoch": 2.691006233303651, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.384069442749023, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8754562735557556, + "num_tokens": 807206422.0, + "step": 21154 + }, + { + "epoch": 2.6911334435822414, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.576356887817383, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8600193858146667, + "num_tokens": 807240867.0, + "step": 21155 + }, + { + "epoch": 2.691260653860832, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.552852630615234, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8838487267494202, + "num_tokens": 807275838.0, + "step": 21156 + }, + { + "epoch": 2.6913878641394224, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.29513931274414, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.882778525352478, + "num_tokens": 807315164.0, + "step": 21157 + }, + { + "epoch": 2.691515074418013, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5562801361084, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8652947545051575, + "num_tokens": 807358016.0, + "step": 21158 + }, + { + "epoch": 2.6916422846966035, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.350065231323242, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8724764585494995, + "num_tokens": 807402751.0, + "step": 21159 + }, + { + "epoch": 2.691769494975194, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.746551513671875, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8782283067703247, + "num_tokens": 807442761.0, + "step": 21160 + }, + { + "epoch": 2.6918967052537845, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.709348678588867, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8814109563827515, + "num_tokens": 807478550.0, + "step": 21161 + }, + { + "epoch": 2.692023915532375, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.266788482666016, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8826397061347961, + "num_tokens": 807522756.0, + "step": 21162 + }, + { + "epoch": 2.6921511258109656, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63309097290039, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8793162107467651, + "num_tokens": 807561701.0, + "step": 21163 + }, + { + "epoch": 2.692278336089556, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.678234100341797, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8726919889450073, + "num_tokens": 807604058.0, + "step": 21164 + }, + { + "epoch": 2.6924055463681467, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.299335479736328, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8791725635528564, + "num_tokens": 807637344.0, + "step": 21165 + }, + { + "epoch": 2.692532756646737, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69684600830078, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.867246687412262, + "num_tokens": 807676867.0, + "step": 21166 + }, + { + "epoch": 2.6926599669253277, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.474973678588867, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8767412900924683, + "num_tokens": 807707183.0, + "step": 21167 + }, + { + "epoch": 2.6927871772039182, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80411148071289, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8648066520690918, + "num_tokens": 807737617.0, + "step": 21168 + }, + { + "epoch": 2.6929143874825083, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8808650970459, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8676326274871826, + "num_tokens": 807774141.0, + "step": 21169 + }, + { + "epoch": 2.6930415977610993, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.362380981445312, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8789860010147095, + "num_tokens": 807813375.0, + "step": 21170 + }, + { + "epoch": 2.6931688080396894, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.453569412231445, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8611714839935303, + "num_tokens": 807852317.0, + "step": 21171 + }, + { + "epoch": 2.6932960183182804, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.363418579101562, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8756090402603149, + "num_tokens": 807895266.0, + "step": 21172 + }, + { + "epoch": 2.6934232285968704, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.423688888549805, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8761234283447266, + "num_tokens": 807938600.0, + "step": 21173 + }, + { + "epoch": 2.693550438875461, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50203514099121, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8939565420150757, + "num_tokens": 807974329.0, + "step": 21174 + }, + { + "epoch": 2.6936776491540515, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.187458038330078, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.885493278503418, + "num_tokens": 808015666.0, + "step": 21175 + }, + { + "epoch": 2.693804859432642, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69086456298828, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8845981359481812, + "num_tokens": 808050176.0, + "step": 21176 + }, + { + "epoch": 2.6939320697112326, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.741436004638672, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8666284084320068, + "num_tokens": 808091910.0, + "step": 21177 + }, + { + "epoch": 2.694059279989823, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.417102813720703, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8693355917930603, + "num_tokens": 808128646.0, + "step": 21178 + }, + { + "epoch": 2.6941864902684136, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.576427459716797, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8782986402511597, + "num_tokens": 808166544.0, + "step": 21179 + }, + { + "epoch": 2.694313700547004, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.637325286865234, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8640483617782593, + "num_tokens": 808200051.0, + "step": 21180 + }, + { + "epoch": 2.6944409108255947, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.37485122680664, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8763153553009033, + "num_tokens": 808239251.0, + "step": 21181 + }, + { + "epoch": 2.694568121104185, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.366670608520508, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.868810772895813, + "num_tokens": 808280639.0, + "step": 21182 + }, + { + "epoch": 2.6946953313827757, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.314285278320312, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8812519311904907, + "num_tokens": 808316141.0, + "step": 21183 + }, + { + "epoch": 2.6948225416613663, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89272117614746, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8730796575546265, + "num_tokens": 808359172.0, + "step": 21184 + }, + { + "epoch": 2.694949751939957, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.406713485717773, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8472917675971985, + "num_tokens": 808397823.0, + "step": 21185 + }, + { + "epoch": 2.6950769622185473, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.508995056152344, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.883554220199585, + "num_tokens": 808435355.0, + "step": 21186 + }, + { + "epoch": 2.695204172497138, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51970863342285, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8809464573860168, + "num_tokens": 808470792.0, + "step": 21187 + }, + { + "epoch": 2.6953313827757284, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56343650817871, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8916476368904114, + "num_tokens": 808504591.0, + "step": 21188 + }, + { + "epoch": 2.695458593054319, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60560417175293, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8852730989456177, + "num_tokens": 808537167.0, + "step": 21189 + }, + { + "epoch": 2.6955858033329094, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.273927688598633, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8885304927825928, + "num_tokens": 808572055.0, + "step": 21190 + }, + { + "epoch": 2.6957130136115, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.732179641723633, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8859761953353882, + "num_tokens": 808605239.0, + "step": 21191 + }, + { + "epoch": 2.6958402238900905, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.85624885559082, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8522931337356567, + "num_tokens": 808643227.0, + "step": 21192 + }, + { + "epoch": 2.695967434168681, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.398462295532227, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8685818910598755, + "num_tokens": 808683855.0, + "step": 21193 + }, + { + "epoch": 2.696094644447271, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.426225662231445, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8810651302337646, + "num_tokens": 808723467.0, + "step": 21194 + }, + { + "epoch": 2.696221854725862, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.497085571289062, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8633719682693481, + "num_tokens": 808759805.0, + "step": 21195 + }, + { + "epoch": 2.696349065004452, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.607894897460938, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8578304052352905, + "num_tokens": 808802024.0, + "step": 21196 + }, + { + "epoch": 2.696476275283043, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44737434387207, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8671085238456726, + "num_tokens": 808837086.0, + "step": 21197 + }, + { + "epoch": 2.696603485561633, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.535865783691406, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8734807968139648, + "num_tokens": 808875261.0, + "step": 21198 + }, + { + "epoch": 2.6967306958402237, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55865478515625, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8808850049972534, + "num_tokens": 808913044.0, + "step": 21199 + }, + { + "epoch": 2.6968579061188143, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.886253356933594, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8822171092033386, + "num_tokens": 808952374.0, + "step": 21200 + }, + { + "epoch": 2.696985116397405, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.360645294189453, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.875764012336731, + "num_tokens": 808994397.0, + "step": 21201 + }, + { + "epoch": 2.6971123266759953, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55219078063965, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8774232864379883, + "num_tokens": 809036401.0, + "step": 21202 + }, + { + "epoch": 2.697239536954586, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4563045501709, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.879228949546814, + "num_tokens": 809073073.0, + "step": 21203 + }, + { + "epoch": 2.6973667472331764, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47657585144043, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8740106821060181, + "num_tokens": 809117348.0, + "step": 21204 + }, + { + "epoch": 2.697493957511767, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.517282485961914, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8869123458862305, + "num_tokens": 809151357.0, + "step": 21205 + }, + { + "epoch": 2.6976211677903574, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.336210250854492, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8685417771339417, + "num_tokens": 809189000.0, + "step": 21206 + }, + { + "epoch": 2.697748378068948, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.669931411743164, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8777462840080261, + "num_tokens": 809227380.0, + "step": 21207 + }, + { + "epoch": 2.6978755883475385, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.382646560668945, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8744227886199951, + "num_tokens": 809265131.0, + "step": 21208 + }, + { + "epoch": 2.698002798626129, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.502050399780273, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8733115196228027, + "num_tokens": 809297190.0, + "step": 21209 + }, + { + "epoch": 2.6981300089047195, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55376625061035, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8839435577392578, + "num_tokens": 809336774.0, + "step": 21210 + }, + { + "epoch": 2.69825721918331, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.302125930786133, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.858176589012146, + "num_tokens": 809376612.0, + "step": 21211 + }, + { + "epoch": 2.6983844294619006, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53563117980957, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8970736265182495, + "num_tokens": 809416058.0, + "step": 21212 + }, + { + "epoch": 2.698511639740491, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.261249542236328, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8849856853485107, + "num_tokens": 809449580.0, + "step": 21213 + }, + { + "epoch": 2.6986388500190817, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.367469787597656, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8771127462387085, + "num_tokens": 809484983.0, + "step": 21214 + }, + { + "epoch": 2.698766060297672, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.951204299926758, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8774374723434448, + "num_tokens": 809521086.0, + "step": 21215 + }, + { + "epoch": 2.6988932705762627, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.259166717529297, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8720879554748535, + "num_tokens": 809562475.0, + "step": 21216 + }, + { + "epoch": 2.699020480854853, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.548137664794922, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8612055778503418, + "num_tokens": 809596686.0, + "step": 21217 + }, + { + "epoch": 2.6991476911334438, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.429241180419922, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8754497170448303, + "num_tokens": 809636614.0, + "step": 21218 + }, + { + "epoch": 2.699274901412034, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.558822631835938, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8630488514900208, + "num_tokens": 809674542.0, + "step": 21219 + }, + { + "epoch": 2.699402111690625, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4053955078125, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8904280066490173, + "num_tokens": 809716726.0, + "step": 21220 + }, + { + "epoch": 2.699529321969215, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.282529830932617, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8839342594146729, + "num_tokens": 809756790.0, + "step": 21221 + }, + { + "epoch": 2.699656532247806, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.751136779785156, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8712946772575378, + "num_tokens": 809794616.0, + "step": 21222 + }, + { + "epoch": 2.699783742526396, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.795740127563477, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8800705671310425, + "num_tokens": 809836662.0, + "step": 21223 + }, + { + "epoch": 2.6999109528049865, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.326025009155273, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8726713061332703, + "num_tokens": 809873812.0, + "step": 21224 + }, + { + "epoch": 2.700038163083577, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.756145477294922, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8743430376052856, + "num_tokens": 809913139.0, + "step": 21225 + }, + { + "epoch": 2.7001653733621676, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.368621826171875, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8757367730140686, + "num_tokens": 809947741.0, + "step": 21226 + }, + { + "epoch": 2.700292583640758, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.346786499023438, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8669154047966003, + "num_tokens": 809987120.0, + "step": 21227 + }, + { + "epoch": 2.7004197939193486, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53129768371582, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8688021898269653, + "num_tokens": 810026972.0, + "step": 21228 + }, + { + "epoch": 2.700547004197939, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.030241012573242, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8727738261222839, + "num_tokens": 810062767.0, + "step": 21229 + }, + { + "epoch": 2.7006742144765297, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.325878143310547, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8828461766242981, + "num_tokens": 810096930.0, + "step": 21230 + }, + { + "epoch": 2.70080142475512, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.68769073486328, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8613561987876892, + "num_tokens": 810134600.0, + "step": 21231 + }, + { + "epoch": 2.7009286350337107, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40524673461914, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8756897449493408, + "num_tokens": 810169535.0, + "step": 21232 + }, + { + "epoch": 2.7010558453123013, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.540014266967773, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8641536831855774, + "num_tokens": 810205288.0, + "step": 21233 + }, + { + "epoch": 2.701183055590892, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.294214248657227, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8712961673736572, + "num_tokens": 810242868.0, + "step": 21234 + }, + { + "epoch": 2.7013102658694823, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.371700286865234, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8629099726676941, + "num_tokens": 810286539.0, + "step": 21235 + }, + { + "epoch": 2.701437476148073, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.23948097229004, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8776236772537231, + "num_tokens": 810331598.0, + "step": 21236 + }, + { + "epoch": 2.7015646864266634, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6267147064209, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8828370571136475, + "num_tokens": 810369501.0, + "step": 21237 + }, + { + "epoch": 2.701691896705254, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.180707931518555, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8718436360359192, + "num_tokens": 810410727.0, + "step": 21238 + }, + { + "epoch": 2.7018191069838444, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62348747253418, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8652993440628052, + "num_tokens": 810450315.0, + "step": 21239 + }, + { + "epoch": 2.701946317262435, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.401742935180664, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8660837411880493, + "num_tokens": 810493972.0, + "step": 21240 + }, + { + "epoch": 2.7020735275410255, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6148624420166, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8852767944335938, + "num_tokens": 810526038.0, + "step": 21241 + }, + { + "epoch": 2.7022007378196156, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.652381896972656, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8781876564025879, + "num_tokens": 810560772.0, + "step": 21242 + }, + { + "epoch": 2.7023279480982065, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.606151580810547, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8818449974060059, + "num_tokens": 810598322.0, + "step": 21243 + }, + { + "epoch": 2.7024551583767966, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.392873764038086, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8938268423080444, + "num_tokens": 810638652.0, + "step": 21244 + }, + { + "epoch": 2.7025823686553876, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.547523498535156, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8730697631835938, + "num_tokens": 810680318.0, + "step": 21245 + }, + { + "epoch": 2.7027095789339777, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69707679748535, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8790152668952942, + "num_tokens": 810715001.0, + "step": 21246 + }, + { + "epoch": 2.7028367892125686, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.448257446289062, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8664484024047852, + "num_tokens": 810748897.0, + "step": 21247 + }, + { + "epoch": 2.7029639994911587, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.554241180419922, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8679828643798828, + "num_tokens": 810788954.0, + "step": 21248 + }, + { + "epoch": 2.7030912097697493, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.454105377197266, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.876849353313446, + "num_tokens": 810828897.0, + "step": 21249 + }, + { + "epoch": 2.70321842004834, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.654512405395508, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8911653161048889, + "num_tokens": 810866698.0, + "step": 21250 + }, + { + "epoch": 2.7033456303269303, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.714052200317383, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8673381805419922, + "num_tokens": 810903140.0, + "step": 21251 + }, + { + "epoch": 2.703472840605521, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.48893165588379, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8783871531486511, + "num_tokens": 810940664.0, + "step": 21252 + }, + { + "epoch": 2.7036000508841114, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.411006927490234, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8722786903381348, + "num_tokens": 810981818.0, + "step": 21253 + }, + { + "epoch": 2.703727261162702, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.480024337768555, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8739463686943054, + "num_tokens": 811019897.0, + "step": 21254 + }, + { + "epoch": 2.7038544714412924, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77958106994629, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8783072233200073, + "num_tokens": 811056310.0, + "step": 21255 + }, + { + "epoch": 2.703981681719883, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65410804748535, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8636943697929382, + "num_tokens": 811091505.0, + "step": 21256 + }, + { + "epoch": 2.7041088919984735, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.517860412597656, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8853439092636108, + "num_tokens": 811134861.0, + "step": 21257 + }, + { + "epoch": 2.704236102277064, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.446006774902344, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8661237359046936, + "num_tokens": 811170001.0, + "step": 21258 + }, + { + "epoch": 2.7043633125556545, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.388193130493164, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.879101037979126, + "num_tokens": 811204203.0, + "step": 21259 + }, + { + "epoch": 2.704490522834245, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.699626922607422, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8783215880393982, + "num_tokens": 811244598.0, + "step": 21260 + }, + { + "epoch": 2.7046177331128356, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.420772552490234, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.884706437587738, + "num_tokens": 811283352.0, + "step": 21261 + }, + { + "epoch": 2.704744943391426, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.471540451049805, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8631754517555237, + "num_tokens": 811317605.0, + "step": 21262 + }, + { + "epoch": 2.7048721536700167, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5008544921875, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8903520107269287, + "num_tokens": 811352117.0, + "step": 21263 + }, + { + "epoch": 2.704999363948607, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503971099853516, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8661119341850281, + "num_tokens": 811386118.0, + "step": 21264 + }, + { + "epoch": 2.7051265742271977, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.539939880371094, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8849923610687256, + "num_tokens": 811424479.0, + "step": 21265 + }, + { + "epoch": 2.7052537845057882, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.522754669189453, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8607511520385742, + "num_tokens": 811464683.0, + "step": 21266 + }, + { + "epoch": 2.7053809947843783, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.498638153076172, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8898836374282837, + "num_tokens": 811504801.0, + "step": 21267 + }, + { + "epoch": 2.7055082050629693, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56485939025879, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8660194277763367, + "num_tokens": 811546278.0, + "step": 21268 + }, + { + "epoch": 2.7056354153415594, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.592370986938477, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8611222505569458, + "num_tokens": 811587019.0, + "step": 21269 + }, + { + "epoch": 2.7057626256201504, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.445465087890625, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8501189947128296, + "num_tokens": 811619843.0, + "step": 21270 + }, + { + "epoch": 2.7058898358987404, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.512250900268555, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8804379105567932, + "num_tokens": 811655797.0, + "step": 21271 + }, + { + "epoch": 2.706017046177331, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.731578826904297, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8658596277236938, + "num_tokens": 811691626.0, + "step": 21272 + }, + { + "epoch": 2.7061442564559215, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.497507095336914, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8760185241699219, + "num_tokens": 811726046.0, + "step": 21273 + }, + { + "epoch": 2.706271466734512, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.407331466674805, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8644262552261353, + "num_tokens": 811765686.0, + "step": 21274 + }, + { + "epoch": 2.7063986770131025, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.432893753051758, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8814681768417358, + "num_tokens": 811801143.0, + "step": 21275 + }, + { + "epoch": 2.706525887291693, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59642219543457, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8812021017074585, + "num_tokens": 811834807.0, + "step": 21276 + }, + { + "epoch": 2.7066530975702836, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.579387664794922, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.893062174320221, + "num_tokens": 811865676.0, + "step": 21277 + }, + { + "epoch": 2.706780307848874, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.470205307006836, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8732912540435791, + "num_tokens": 811901259.0, + "step": 21278 + }, + { + "epoch": 2.7069075181274647, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45428466796875, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8606775403022766, + "num_tokens": 811937768.0, + "step": 21279 + }, + { + "epoch": 2.707034728406055, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.523418426513672, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8738824129104614, + "num_tokens": 811967805.0, + "step": 21280 + }, + { + "epoch": 2.7071619386846457, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.723495483398438, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8700039386749268, + "num_tokens": 812007468.0, + "step": 21281 + }, + { + "epoch": 2.7072891489632362, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89118766784668, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8760032653808594, + "num_tokens": 812048127.0, + "step": 21282 + }, + { + "epoch": 2.7074163592418268, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.349245071411133, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.874905526638031, + "num_tokens": 812083904.0, + "step": 21283 + }, + { + "epoch": 2.7075435695204173, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55240821838379, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8797401189804077, + "num_tokens": 812119643.0, + "step": 21284 + }, + { + "epoch": 2.707670779799008, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.491058349609375, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8730466365814209, + "num_tokens": 812155338.0, + "step": 21285 + }, + { + "epoch": 2.7077979900775984, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.504892349243164, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8745533227920532, + "num_tokens": 812193075.0, + "step": 21286 + }, + { + "epoch": 2.707925200356189, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.650053024291992, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8834195137023926, + "num_tokens": 812228163.0, + "step": 21287 + }, + { + "epoch": 2.7080524106347794, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.355106353759766, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8686354160308838, + "num_tokens": 812268565.0, + "step": 21288 + }, + { + "epoch": 2.70817962091337, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54597282409668, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8800997734069824, + "num_tokens": 812307292.0, + "step": 21289 + }, + { + "epoch": 2.7083068311919605, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54157066345215, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8844748735427856, + "num_tokens": 812346180.0, + "step": 21290 + }, + { + "epoch": 2.708434041470551, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.382905960083008, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8717656135559082, + "num_tokens": 812376033.0, + "step": 21291 + }, + { + "epoch": 2.708561251749141, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45345115661621, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8906840085983276, + "num_tokens": 812407925.0, + "step": 21292 + }, + { + "epoch": 2.708688462027732, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.608646392822266, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.880437970161438, + "num_tokens": 812452354.0, + "step": 21293 + }, + { + "epoch": 2.708815672306322, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.52435302734375, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8744449615478516, + "num_tokens": 812490015.0, + "step": 21294 + }, + { + "epoch": 2.708942882584913, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41049575805664, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8722046613693237, + "num_tokens": 812529919.0, + "step": 21295 + }, + { + "epoch": 2.709070092863503, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.67331886291504, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8873858451843262, + "num_tokens": 812568541.0, + "step": 21296 + }, + { + "epoch": 2.7091973031420937, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.564191818237305, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8655970692634583, + "num_tokens": 812612098.0, + "step": 21297 + }, + { + "epoch": 2.7093245134206843, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57490348815918, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8705764412879944, + "num_tokens": 812648843.0, + "step": 21298 + }, + { + "epoch": 2.709451723699275, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.665756225585938, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8742190599441528, + "num_tokens": 812692080.0, + "step": 21299 + }, + { + "epoch": 2.7095789339778653, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.418527603149414, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.888842761516571, + "num_tokens": 812729441.0, + "step": 21300 + }, + { + "epoch": 2.709706144256456, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.606712341308594, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8685386180877686, + "num_tokens": 812768908.0, + "step": 21301 + }, + { + "epoch": 2.7098333545350464, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.476430892944336, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8749359846115112, + "num_tokens": 812808384.0, + "step": 21302 + }, + { + "epoch": 2.709960564813637, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503021240234375, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8669350147247314, + "num_tokens": 812847658.0, + "step": 21303 + }, + { + "epoch": 2.7100877750922274, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.570764541625977, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8753249645233154, + "num_tokens": 812889394.0, + "step": 21304 + }, + { + "epoch": 2.710214985370818, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.634737014770508, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8616582751274109, + "num_tokens": 812924202.0, + "step": 21305 + }, + { + "epoch": 2.7103421956494085, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53873062133789, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8942397832870483, + "num_tokens": 812954391.0, + "step": 21306 + }, + { + "epoch": 2.710469405927999, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.416227340698242, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8665276765823364, + "num_tokens": 812989498.0, + "step": 21307 + }, + { + "epoch": 2.7105966162065895, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.331308364868164, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8816699385643005, + "num_tokens": 813030968.0, + "step": 21308 + }, + { + "epoch": 2.71072382648518, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.578838348388672, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8956185579299927, + "num_tokens": 813065843.0, + "step": 21309 + }, + { + "epoch": 2.7108510367637706, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.289365768432617, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8801571130752563, + "num_tokens": 813107410.0, + "step": 21310 + }, + { + "epoch": 2.710978247042361, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.851844787597656, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8807171583175659, + "num_tokens": 813155959.0, + "step": 21311 + }, + { + "epoch": 2.7111054573209517, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.529037475585938, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8816288709640503, + "num_tokens": 813194133.0, + "step": 21312 + }, + { + "epoch": 2.711232667599542, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.607969284057617, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8628475666046143, + "num_tokens": 813231015.0, + "step": 21313 + }, + { + "epoch": 2.7113598778781327, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60047721862793, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8901084065437317, + "num_tokens": 813271497.0, + "step": 21314 + }, + { + "epoch": 2.711487088156723, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.447765350341797, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8768953680992126, + "num_tokens": 813305758.0, + "step": 21315 + }, + { + "epoch": 2.7116142984353138, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.419902801513672, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8733289241790771, + "num_tokens": 813345958.0, + "step": 21316 + }, + { + "epoch": 2.711741508713904, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.771677017211914, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8785790205001831, + "num_tokens": 813381630.0, + "step": 21317 + }, + { + "epoch": 2.711868718992495, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.590103149414062, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8749303817749023, + "num_tokens": 813421320.0, + "step": 21318 + }, + { + "epoch": 2.711995929271085, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.247583389282227, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8689603209495544, + "num_tokens": 813458323.0, + "step": 21319 + }, + { + "epoch": 2.712123139549676, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69350242614746, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8566917777061462, + "num_tokens": 813501436.0, + "step": 21320 + }, + { + "epoch": 2.712250349828266, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.670297622680664, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8753098845481873, + "num_tokens": 813545942.0, + "step": 21321 + }, + { + "epoch": 2.7123775601068565, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.395591735839844, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.869624137878418, + "num_tokens": 813582202.0, + "step": 21322 + }, + { + "epoch": 2.712504770385447, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.517532348632812, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8600096702575684, + "num_tokens": 813621711.0, + "step": 21323 + }, + { + "epoch": 2.7126319806640375, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.596628189086914, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8695777654647827, + "num_tokens": 813665404.0, + "step": 21324 + }, + { + "epoch": 2.712759190942628, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.31710433959961, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.881244421005249, + "num_tokens": 813697470.0, + "step": 21325 + }, + { + "epoch": 2.7128864012212186, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.546791076660156, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8809517025947571, + "num_tokens": 813733308.0, + "step": 21326 + }, + { + "epoch": 2.713013611499809, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.739093780517578, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8606188893318176, + "num_tokens": 813770881.0, + "step": 21327 + }, + { + "epoch": 2.7131408217783997, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.332857131958008, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8803148865699768, + "num_tokens": 813806495.0, + "step": 21328 + }, + { + "epoch": 2.71326803205699, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.448352813720703, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8803156018257141, + "num_tokens": 813842408.0, + "step": 21329 + }, + { + "epoch": 2.7133952423355807, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.462730407714844, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8712237477302551, + "num_tokens": 813883041.0, + "step": 21330 + }, + { + "epoch": 2.7135224526141712, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51167869567871, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8705867528915405, + "num_tokens": 813922901.0, + "step": 21331 + }, + { + "epoch": 2.7136496628927618, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.375934600830078, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8766713738441467, + "num_tokens": 813961948.0, + "step": 21332 + }, + { + "epoch": 2.7137768731713523, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63804817199707, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8880733251571655, + "num_tokens": 813999194.0, + "step": 21333 + }, + { + "epoch": 2.713904083449943, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.509016036987305, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8639516830444336, + "num_tokens": 814038986.0, + "step": 21334 + }, + { + "epoch": 2.7140312937285334, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.663822174072266, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.888367772102356, + "num_tokens": 814074351.0, + "step": 21335 + }, + { + "epoch": 2.714158504007124, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.509742736816406, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8743813037872314, + "num_tokens": 814109255.0, + "step": 21336 + }, + { + "epoch": 2.7142857142857144, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.433927536010742, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8716394901275635, + "num_tokens": 814146601.0, + "step": 21337 + }, + { + "epoch": 2.714412924564305, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40819549560547, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8789294362068176, + "num_tokens": 814181299.0, + "step": 21338 + }, + { + "epoch": 2.7145401348428955, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.343791961669922, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8826192617416382, + "num_tokens": 814218178.0, + "step": 21339 + }, + { + "epoch": 2.7146673451214856, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.545305252075195, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.876015305519104, + "num_tokens": 814254161.0, + "step": 21340 + }, + { + "epoch": 2.7147945554000765, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38071060180664, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8746683597564697, + "num_tokens": 814290164.0, + "step": 21341 + }, + { + "epoch": 2.7149217656786666, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.592599868774414, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8537719249725342, + "num_tokens": 814334459.0, + "step": 21342 + }, + { + "epoch": 2.7150489759572576, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.403783798217773, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8660030961036682, + "num_tokens": 814374291.0, + "step": 21343 + }, + { + "epoch": 2.7151761862358477, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.499670028686523, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8694099187850952, + "num_tokens": 814413196.0, + "step": 21344 + }, + { + "epoch": 2.7153033965144386, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.550718307495117, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.873445987701416, + "num_tokens": 814455312.0, + "step": 21345 + }, + { + "epoch": 2.7154306067930287, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5377254486084, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8707530498504639, + "num_tokens": 814496586.0, + "step": 21346 + }, + { + "epoch": 2.7155578170716193, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.384042739868164, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8806657791137695, + "num_tokens": 814531566.0, + "step": 21347 + }, + { + "epoch": 2.71568502735021, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.677753448486328, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8805369138717651, + "num_tokens": 814566168.0, + "step": 21348 + }, + { + "epoch": 2.7158122376288003, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.466812133789062, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8741602897644043, + "num_tokens": 814604927.0, + "step": 21349 + }, + { + "epoch": 2.715939447907391, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65058135986328, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.888969361782074, + "num_tokens": 814641401.0, + "step": 21350 + }, + { + "epoch": 2.7160666581859814, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.576257705688477, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8800865411758423, + "num_tokens": 814674252.0, + "step": 21351 + }, + { + "epoch": 2.716193868464572, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.408008575439453, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8730616569519043, + "num_tokens": 814704965.0, + "step": 21352 + }, + { + "epoch": 2.7163210787431624, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.513662338256836, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.846428394317627, + "num_tokens": 814743740.0, + "step": 21353 + }, + { + "epoch": 2.716448289021753, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.510887145996094, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8815929293632507, + "num_tokens": 814784598.0, + "step": 21354 + }, + { + "epoch": 2.7165754993003435, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.375295639038086, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8687953948974609, + "num_tokens": 814821437.0, + "step": 21355 + }, + { + "epoch": 2.716702709578934, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.478967666625977, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8750203251838684, + "num_tokens": 814861638.0, + "step": 21356 + }, + { + "epoch": 2.7168299198575245, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.379154205322266, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8760472536087036, + "num_tokens": 814893799.0, + "step": 21357 + }, + { + "epoch": 2.716957130136115, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57379150390625, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.88245689868927, + "num_tokens": 814930726.0, + "step": 21358 + }, + { + "epoch": 2.7170843404147056, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.377304077148438, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8733901977539062, + "num_tokens": 814970041.0, + "step": 21359 + }, + { + "epoch": 2.717211550693296, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.395957946777344, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8792318105697632, + "num_tokens": 815008587.0, + "step": 21360 + }, + { + "epoch": 2.7173387609718866, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30018424987793, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8620368242263794, + "num_tokens": 815047659.0, + "step": 21361 + }, + { + "epoch": 2.717465971250477, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.788373947143555, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8745411038398743, + "num_tokens": 815082631.0, + "step": 21362 + }, + { + "epoch": 2.7175931815290677, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.559825897216797, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8927897810935974, + "num_tokens": 815117568.0, + "step": 21363 + }, + { + "epoch": 2.7177203918076582, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.466899871826172, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8750975131988525, + "num_tokens": 815155019.0, + "step": 21364 + }, + { + "epoch": 2.7178476020862483, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.443300247192383, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8749707937240601, + "num_tokens": 815195386.0, + "step": 21365 + }, + { + "epoch": 2.7179748123648393, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.347492218017578, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8789602518081665, + "num_tokens": 815228304.0, + "step": 21366 + }, + { + "epoch": 2.7181020226434294, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59148406982422, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8935642242431641, + "num_tokens": 815260111.0, + "step": 21367 + }, + { + "epoch": 2.7182292329220203, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.486324310302734, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8741138577461243, + "num_tokens": 815297272.0, + "step": 21368 + }, + { + "epoch": 2.7183564432006104, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.464460372924805, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8733224868774414, + "num_tokens": 815336016.0, + "step": 21369 + }, + { + "epoch": 2.718483653479201, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60506820678711, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8481152057647705, + "num_tokens": 815373940.0, + "step": 21370 + }, + { + "epoch": 2.7186108637577915, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4222412109375, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.864460825920105, + "num_tokens": 815410836.0, + "step": 21371 + }, + { + "epoch": 2.718738074036382, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.339906692504883, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8805252909660339, + "num_tokens": 815449577.0, + "step": 21372 + }, + { + "epoch": 2.7188652843149725, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62995719909668, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8778505325317383, + "num_tokens": 815485869.0, + "step": 21373 + }, + { + "epoch": 2.718992494593563, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.462688446044922, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8566384315490723, + "num_tokens": 815523614.0, + "step": 21374 + }, + { + "epoch": 2.7191197048721536, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.567062377929688, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8541262745857239, + "num_tokens": 815561835.0, + "step": 21375 + }, + { + "epoch": 2.719246915150744, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30299949645996, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8593510389328003, + "num_tokens": 815605596.0, + "step": 21376 + }, + { + "epoch": 2.7193741254293347, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39134979248047, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8688135147094727, + "num_tokens": 815638720.0, + "step": 21377 + }, + { + "epoch": 2.719501335707925, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55182647705078, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8901400566101074, + "num_tokens": 815673996.0, + "step": 21378 + }, + { + "epoch": 2.7196285459865157, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.496658325195312, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8623975515365601, + "num_tokens": 815716571.0, + "step": 21379 + }, + { + "epoch": 2.7197557562651062, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77928352355957, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8812693953514099, + "num_tokens": 815748973.0, + "step": 21380 + }, + { + "epoch": 2.7198829665436968, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.574220657348633, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8779551982879639, + "num_tokens": 815794754.0, + "step": 21381 + }, + { + "epoch": 2.7200101768222873, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.30521011352539, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8775577545166016, + "num_tokens": 815831051.0, + "step": 21382 + }, + { + "epoch": 2.720137387100878, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.33588218688965, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8776804208755493, + "num_tokens": 815874291.0, + "step": 21383 + }, + { + "epoch": 2.7202645973794684, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.584056854248047, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8712875247001648, + "num_tokens": 815912484.0, + "step": 21384 + }, + { + "epoch": 2.720391807658059, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.272628784179688, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8586021065711975, + "num_tokens": 815948626.0, + "step": 21385 + }, + { + "epoch": 2.7205190179366494, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.612220764160156, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8834344148635864, + "num_tokens": 815983154.0, + "step": 21386 + }, + { + "epoch": 2.72064622821524, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.527538299560547, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8777447938919067, + "num_tokens": 816017524.0, + "step": 21387 + }, + { + "epoch": 2.7207734384938305, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.654821395874023, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8697161674499512, + "num_tokens": 816055204.0, + "step": 21388 + }, + { + "epoch": 2.720900648772421, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.435291290283203, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8821595311164856, + "num_tokens": 816086671.0, + "step": 21389 + }, + { + "epoch": 2.721027859051011, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.230552673339844, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8552083373069763, + "num_tokens": 816114173.0, + "step": 21390 + }, + { + "epoch": 2.721155069329602, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.681995391845703, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8863328695297241, + "num_tokens": 816156434.0, + "step": 21391 + }, + { + "epoch": 2.721282279608192, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.584102630615234, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8701586723327637, + "num_tokens": 816191037.0, + "step": 21392 + }, + { + "epoch": 2.721409489886783, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54300880432129, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8877807855606079, + "num_tokens": 816227031.0, + "step": 21393 + }, + { + "epoch": 2.721536700165373, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.529922485351562, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8783692717552185, + "num_tokens": 816264308.0, + "step": 21394 + }, + { + "epoch": 2.7216639104439637, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.316394805908203, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8723543286323547, + "num_tokens": 816300955.0, + "step": 21395 + }, + { + "epoch": 2.7217911207225542, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.67906379699707, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8752681612968445, + "num_tokens": 816332881.0, + "step": 21396 + }, + { + "epoch": 2.7219183310011448, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.545732498168945, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8929508924484253, + "num_tokens": 816367446.0, + "step": 21397 + }, + { + "epoch": 2.7220455412797353, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.570838928222656, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8587321043014526, + "num_tokens": 816407286.0, + "step": 21398 + }, + { + "epoch": 2.722172751558326, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.424564361572266, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8836274147033691, + "num_tokens": 816445277.0, + "step": 21399 + }, + { + "epoch": 2.7222999618369164, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.514310836791992, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8797167539596558, + "num_tokens": 816481638.0, + "step": 21400 + }, + { + "epoch": 2.722427172115507, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.513916015625, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8762334585189819, + "num_tokens": 816521386.0, + "step": 21401 + }, + { + "epoch": 2.7225543823940974, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.466949462890625, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8858909606933594, + "num_tokens": 816554760.0, + "step": 21402 + }, + { + "epoch": 2.722681592672688, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.325422286987305, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8835179805755615, + "num_tokens": 816595195.0, + "step": 21403 + }, + { + "epoch": 2.7228088029512785, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53754234313965, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8771054744720459, + "num_tokens": 816633046.0, + "step": 21404 + }, + { + "epoch": 2.722936013229869, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62793731689453, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8882713317871094, + "num_tokens": 816675283.0, + "step": 21405 + }, + { + "epoch": 2.7230632235084595, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41470718383789, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8657995462417603, + "num_tokens": 816716499.0, + "step": 21406 + }, + { + "epoch": 2.72319043378705, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89236068725586, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8682704567909241, + "num_tokens": 816761825.0, + "step": 21407 + }, + { + "epoch": 2.7233176440656406, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.219913482666016, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8809236288070679, + "num_tokens": 816792080.0, + "step": 21408 + }, + { + "epoch": 2.723444854344231, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.488563537597656, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8649452328681946, + "num_tokens": 816832121.0, + "step": 21409 + }, + { + "epoch": 2.7235720646228216, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.696430206298828, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8684110641479492, + "num_tokens": 816874032.0, + "step": 21410 + }, + { + "epoch": 2.723699274901412, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.355134963989258, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8566893935203552, + "num_tokens": 816920471.0, + "step": 21411 + }, + { + "epoch": 2.7238264851800027, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.692190170288086, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8759404420852661, + "num_tokens": 816958042.0, + "step": 21412 + }, + { + "epoch": 2.723953695458593, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39433479309082, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8665015697479248, + "num_tokens": 816992158.0, + "step": 21413 + }, + { + "epoch": 2.7240809057371838, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.481700897216797, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8642958402633667, + "num_tokens": 817032552.0, + "step": 21414 + }, + { + "epoch": 2.724208116015774, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.437381744384766, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8718456029891968, + "num_tokens": 817072240.0, + "step": 21415 + }, + { + "epoch": 2.724335326294365, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66271209716797, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8656167984008789, + "num_tokens": 817104741.0, + "step": 21416 + }, + { + "epoch": 2.724462536572955, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.504568099975586, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8623262643814087, + "num_tokens": 817143876.0, + "step": 21417 + }, + { + "epoch": 2.724589746851546, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.527851104736328, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8972636461257935, + "num_tokens": 817183647.0, + "step": 21418 + }, + { + "epoch": 2.724716957130136, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44728660583496, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8705865144729614, + "num_tokens": 817214520.0, + "step": 21419 + }, + { + "epoch": 2.7248441674087265, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.379419326782227, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8650187253952026, + "num_tokens": 817256936.0, + "step": 21420 + }, + { + "epoch": 2.724971377687317, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.48002815246582, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8757443428039551, + "num_tokens": 817292840.0, + "step": 21421 + }, + { + "epoch": 2.7250985879659075, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.633203506469727, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8705241680145264, + "num_tokens": 817332872.0, + "step": 21422 + }, + { + "epoch": 2.725225798244498, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.49187469482422, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8825416564941406, + "num_tokens": 817367668.0, + "step": 21423 + }, + { + "epoch": 2.7253530085230886, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.555160522460938, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8928167819976807, + "num_tokens": 817407726.0, + "step": 21424 + }, + { + "epoch": 2.725480218801679, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.523422241210938, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8850369453430176, + "num_tokens": 817441003.0, + "step": 21425 + }, + { + "epoch": 2.7256074290802697, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50133514404297, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.880035400390625, + "num_tokens": 817482587.0, + "step": 21426 + }, + { + "epoch": 2.72573463935886, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.652389526367188, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8786052465438843, + "num_tokens": 817519727.0, + "step": 21427 + }, + { + "epoch": 2.7258618496374507, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.498088836669922, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8715168237686157, + "num_tokens": 817553809.0, + "step": 21428 + }, + { + "epoch": 2.7259890599160412, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.640573501586914, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8593972325325012, + "num_tokens": 817588893.0, + "step": 21429 + }, + { + "epoch": 2.7261162701946318, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.474912643432617, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8804453611373901, + "num_tokens": 817627789.0, + "step": 21430 + }, + { + "epoch": 2.7262434804732223, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61307144165039, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8653196096420288, + "num_tokens": 817670048.0, + "step": 21431 + }, + { + "epoch": 2.726370690751813, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.458864212036133, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8808510303497314, + "num_tokens": 817707108.0, + "step": 21432 + }, + { + "epoch": 2.7264979010304033, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.518726348876953, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8722085952758789, + "num_tokens": 817748126.0, + "step": 21433 + }, + { + "epoch": 2.726625111308994, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.846393585205078, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8781294822692871, + "num_tokens": 817781578.0, + "step": 21434 + }, + { + "epoch": 2.7267523215875844, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.431476593017578, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8810539245605469, + "num_tokens": 817822143.0, + "step": 21435 + }, + { + "epoch": 2.726879531866175, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.545637130737305, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8852992653846741, + "num_tokens": 817858456.0, + "step": 21436 + }, + { + "epoch": 2.7270067421447655, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.539979934692383, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8825050592422485, + "num_tokens": 817892863.0, + "step": 21437 + }, + { + "epoch": 2.7271339524233555, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.488544464111328, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8789775371551514, + "num_tokens": 817931120.0, + "step": 21438 + }, + { + "epoch": 2.7272611627019465, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.819255828857422, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.878678560256958, + "num_tokens": 817963076.0, + "step": 21439 + }, + { + "epoch": 2.7273883729805366, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3011417388916, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8762233257293701, + "num_tokens": 818003844.0, + "step": 21440 + }, + { + "epoch": 2.7275155832591276, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73268699645996, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8775919675827026, + "num_tokens": 818048533.0, + "step": 21441 + }, + { + "epoch": 2.7276427935377177, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60826301574707, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8716241121292114, + "num_tokens": 818083290.0, + "step": 21442 + }, + { + "epoch": 2.7277700038163086, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.394990921020508, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8825153112411499, + "num_tokens": 818120037.0, + "step": 21443 + }, + { + "epoch": 2.7278972140948987, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55521011352539, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8757305145263672, + "num_tokens": 818164122.0, + "step": 21444 + }, + { + "epoch": 2.7280244243734892, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.484830856323242, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8769086599349976, + "num_tokens": 818198065.0, + "step": 21445 + }, + { + "epoch": 2.7281516346520798, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.701215744018555, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8776752352714539, + "num_tokens": 818236843.0, + "step": 21446 + }, + { + "epoch": 2.7282788449306703, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.486759185791016, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8763514757156372, + "num_tokens": 818271300.0, + "step": 21447 + }, + { + "epoch": 2.728406055209261, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.460203170776367, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8608601093292236, + "num_tokens": 818311799.0, + "step": 21448 + }, + { + "epoch": 2.7285332654878514, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4554443359375, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8805493116378784, + "num_tokens": 818346662.0, + "step": 21449 + }, + { + "epoch": 2.728660475766442, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.402395248413086, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.878890335559845, + "num_tokens": 818384465.0, + "step": 21450 + }, + { + "epoch": 2.7287876860450324, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.406858444213867, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.872438907623291, + "num_tokens": 818425625.0, + "step": 21451 + }, + { + "epoch": 2.728914896323623, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.613292694091797, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8800374865531921, + "num_tokens": 818458544.0, + "step": 21452 + }, + { + "epoch": 2.7290421066022135, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.504581451416016, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8709763288497925, + "num_tokens": 818503507.0, + "step": 21453 + }, + { + "epoch": 2.729169316880804, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59669303894043, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8809504508972168, + "num_tokens": 818545411.0, + "step": 21454 + }, + { + "epoch": 2.7292965271593945, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66459083557129, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8702830672264099, + "num_tokens": 818582100.0, + "step": 21455 + }, + { + "epoch": 2.729423737437985, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50159454345703, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8726636171340942, + "num_tokens": 818615549.0, + "step": 21456 + }, + { + "epoch": 2.7295509477165756, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57907485961914, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8795549869537354, + "num_tokens": 818652476.0, + "step": 21457 + }, + { + "epoch": 2.729678157995166, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.632797241210938, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8778601288795471, + "num_tokens": 818692435.0, + "step": 21458 + }, + { + "epoch": 2.7298053682737566, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70979118347168, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8745559453964233, + "num_tokens": 818733206.0, + "step": 21459 + }, + { + "epoch": 2.729932578552347, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.633127212524414, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8787789344787598, + "num_tokens": 818771717.0, + "step": 21460 + }, + { + "epoch": 2.7300597888309377, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5767822265625, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8752240538597107, + "num_tokens": 818806999.0, + "step": 21461 + }, + { + "epoch": 2.7301869991095282, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.517520904541016, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8804179430007935, + "num_tokens": 818846226.0, + "step": 21462 + }, + { + "epoch": 2.7303142093881183, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.496728897094727, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8816173076629639, + "num_tokens": 818885667.0, + "step": 21463 + }, + { + "epoch": 2.7304414196667093, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.467681884765625, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8813253045082092, + "num_tokens": 818924009.0, + "step": 21464 + }, + { + "epoch": 2.7305686299452994, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86308479309082, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8843531012535095, + "num_tokens": 818962614.0, + "step": 21465 + }, + { + "epoch": 2.7306958402238903, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.564407348632812, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8771335482597351, + "num_tokens": 819004486.0, + "step": 21466 + }, + { + "epoch": 2.7308230505024804, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.513595581054688, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8688673973083496, + "num_tokens": 819048364.0, + "step": 21467 + }, + { + "epoch": 2.730950260781071, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.641714096069336, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.871533215045929, + "num_tokens": 819086250.0, + "step": 21468 + }, + { + "epoch": 2.7310774710596615, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6618709564209, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8855672478675842, + "num_tokens": 819126705.0, + "step": 21469 + }, + { + "epoch": 2.731204681338252, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.511451721191406, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8772315979003906, + "num_tokens": 819165967.0, + "step": 21470 + }, + { + "epoch": 2.7313318916168425, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.705184936523438, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.872308611869812, + "num_tokens": 819208066.0, + "step": 21471 + }, + { + "epoch": 2.731459101895433, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.443004608154297, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8720629215240479, + "num_tokens": 819246269.0, + "step": 21472 + }, + { + "epoch": 2.7315863121740236, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6978759765625, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8758618831634521, + "num_tokens": 819286027.0, + "step": 21473 + }, + { + "epoch": 2.731713522452614, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51756477355957, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8844234943389893, + "num_tokens": 819319994.0, + "step": 21474 + }, + { + "epoch": 2.7318407327312046, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.614173889160156, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8716949224472046, + "num_tokens": 819360113.0, + "step": 21475 + }, + { + "epoch": 2.731967943009795, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.430551528930664, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8836969137191772, + "num_tokens": 819394575.0, + "step": 21476 + }, + { + "epoch": 2.7320951532883857, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.528926849365234, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8706561326980591, + "num_tokens": 819434545.0, + "step": 21477 + }, + { + "epoch": 2.7322223635669762, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.644184112548828, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8853623867034912, + "num_tokens": 819470699.0, + "step": 21478 + }, + { + "epoch": 2.7323495738455668, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45494842529297, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8694577217102051, + "num_tokens": 819514447.0, + "step": 21479 + }, + { + "epoch": 2.7324767841241573, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.727460861206055, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.869882345199585, + "num_tokens": 819553434.0, + "step": 21480 + }, + { + "epoch": 2.732603994402748, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.567861557006836, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8690362572669983, + "num_tokens": 819591594.0, + "step": 21481 + }, + { + "epoch": 2.7327312046813383, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.728954315185547, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.871955156326294, + "num_tokens": 819634530.0, + "step": 21482 + }, + { + "epoch": 2.732858414959929, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.598960876464844, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8667691946029663, + "num_tokens": 819669358.0, + "step": 21483 + }, + { + "epoch": 2.7329856252385194, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.652881622314453, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8747482299804688, + "num_tokens": 819711336.0, + "step": 21484 + }, + { + "epoch": 2.73311283551711, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54261589050293, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8664644956588745, + "num_tokens": 819746973.0, + "step": 21485 + }, + { + "epoch": 2.7332400457957005, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.728370666503906, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8849227428436279, + "num_tokens": 819780305.0, + "step": 21486 + }, + { + "epoch": 2.733367256074291, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71094512939453, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8836307525634766, + "num_tokens": 819817326.0, + "step": 21487 + }, + { + "epoch": 2.733494466352881, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.41728401184082, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8695586919784546, + "num_tokens": 819857597.0, + "step": 21488 + }, + { + "epoch": 2.733621676631472, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.827749252319336, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8786835670471191, + "num_tokens": 819893471.0, + "step": 21489 + }, + { + "epoch": 2.733748886910062, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.485004425048828, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8586269617080688, + "num_tokens": 819934175.0, + "step": 21490 + }, + { + "epoch": 2.733876097188653, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50968360900879, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8832381963729858, + "num_tokens": 819971451.0, + "step": 21491 + }, + { + "epoch": 2.734003307467243, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84670066833496, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.870360255241394, + "num_tokens": 820011566.0, + "step": 21492 + }, + { + "epoch": 2.7341305177458337, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5002498626709, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8511548042297363, + "num_tokens": 820047944.0, + "step": 21493 + }, + { + "epoch": 2.7342577280244242, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.871992111206055, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.863589346408844, + "num_tokens": 820083416.0, + "step": 21494 + }, + { + "epoch": 2.7343849383030148, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.604442596435547, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8660463094711304, + "num_tokens": 820122476.0, + "step": 21495 + }, + { + "epoch": 2.7345121485816053, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.462512969970703, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8814635276794434, + "num_tokens": 820159866.0, + "step": 21496 + }, + { + "epoch": 2.734639358860196, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65180778503418, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.876322865486145, + "num_tokens": 820202710.0, + "step": 21497 + }, + { + "epoch": 2.7347665691387864, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.592262268066406, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8730301260948181, + "num_tokens": 820239156.0, + "step": 21498 + }, + { + "epoch": 2.734893779417377, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.620508193969727, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8927662372589111, + "num_tokens": 820274010.0, + "step": 21499 + }, + { + "epoch": 2.7350209896959674, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.920780181884766, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.873421311378479, + "num_tokens": 820319171.0, + "step": 21500 + }, + { + "epoch": 2.735148199974558, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.539518356323242, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8602427840232849, + "num_tokens": 820355038.0, + "step": 21501 + }, + { + "epoch": 2.7352754102531485, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.753339767456055, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8715027570724487, + "num_tokens": 820390973.0, + "step": 21502 + }, + { + "epoch": 2.735402620531739, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.746822357177734, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8700010776519775, + "num_tokens": 820424543.0, + "step": 21503 + }, + { + "epoch": 2.7355298308103295, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.593629837036133, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8731280565261841, + "num_tokens": 820462702.0, + "step": 21504 + }, + { + "epoch": 2.73565704108892, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.76145362854004, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8846858143806458, + "num_tokens": 820498840.0, + "step": 21505 + }, + { + "epoch": 2.7357842513675106, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.052001953125, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8678014874458313, + "num_tokens": 820534083.0, + "step": 21506 + }, + { + "epoch": 2.735911461646101, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.455245971679688, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8783553838729858, + "num_tokens": 820572479.0, + "step": 21507 + }, + { + "epoch": 2.7360386719246916, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.425413131713867, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8638243079185486, + "num_tokens": 820610283.0, + "step": 21508 + }, + { + "epoch": 2.736165882203282, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.591690063476562, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8904163241386414, + "num_tokens": 820647580.0, + "step": 21509 + }, + { + "epoch": 2.7362930924818727, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.458763122558594, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8805680274963379, + "num_tokens": 820686774.0, + "step": 21510 + }, + { + "epoch": 2.7364203027604628, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.513996124267578, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8840053677558899, + "num_tokens": 820724635.0, + "step": 21511 + }, + { + "epoch": 2.7365475130390537, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.906293869018555, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8815611600875854, + "num_tokens": 820761467.0, + "step": 21512 + }, + { + "epoch": 2.736674723317644, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.523897171020508, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8611087799072266, + "num_tokens": 820801001.0, + "step": 21513 + }, + { + "epoch": 2.736801933596235, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.663440704345703, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8702752590179443, + "num_tokens": 820841028.0, + "step": 21514 + }, + { + "epoch": 2.736929143874825, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62409782409668, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8831920623779297, + "num_tokens": 820881613.0, + "step": 21515 + }, + { + "epoch": 2.737056354153416, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.584548950195312, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8819550275802612, + "num_tokens": 820918205.0, + "step": 21516 + }, + { + "epoch": 2.737183564432006, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50487518310547, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8682914972305298, + "num_tokens": 820950751.0, + "step": 21517 + }, + { + "epoch": 2.7373107747105965, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7742977142334, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8777190446853638, + "num_tokens": 820987385.0, + "step": 21518 + }, + { + "epoch": 2.737437984989187, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39031219482422, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8675196766853333, + "num_tokens": 821026273.0, + "step": 21519 + }, + { + "epoch": 2.7375651952677775, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.362716674804688, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.866843581199646, + "num_tokens": 821067978.0, + "step": 21520 + }, + { + "epoch": 2.737692405546368, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.630258560180664, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8804656863212585, + "num_tokens": 821111925.0, + "step": 21521 + }, + { + "epoch": 2.7378196158249586, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.586490631103516, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8785482048988342, + "num_tokens": 821151019.0, + "step": 21522 + }, + { + "epoch": 2.737946826103549, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.428739547729492, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8812552690505981, + "num_tokens": 821185520.0, + "step": 21523 + }, + { + "epoch": 2.7380740363821396, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.543386459350586, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8764743804931641, + "num_tokens": 821224273.0, + "step": 21524 + }, + { + "epoch": 2.73820124666073, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.752710342407227, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8810815811157227, + "num_tokens": 821263541.0, + "step": 21525 + }, + { + "epoch": 2.7383284569393207, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.360322952270508, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8632892370223999, + "num_tokens": 821304053.0, + "step": 21526 + }, + { + "epoch": 2.7384556672179112, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.613874435424805, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8745795488357544, + "num_tokens": 821339710.0, + "step": 21527 + }, + { + "epoch": 2.7385828774965018, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.636962890625, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8758689165115356, + "num_tokens": 821378377.0, + "step": 21528 + }, + { + "epoch": 2.7387100877750923, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503517150878906, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8837911486625671, + "num_tokens": 821417434.0, + "step": 21529 + }, + { + "epoch": 2.738837298053683, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.484928131103516, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8707056641578674, + "num_tokens": 821464323.0, + "step": 21530 + }, + { + "epoch": 2.7389645083322733, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.712547302246094, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8724370002746582, + "num_tokens": 821509891.0, + "step": 21531 + }, + { + "epoch": 2.739091718610864, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.791868209838867, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8644205927848816, + "num_tokens": 821557309.0, + "step": 21532 + }, + { + "epoch": 2.7392189288894544, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.507125854492188, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8705557584762573, + "num_tokens": 821597751.0, + "step": 21533 + }, + { + "epoch": 2.739346139168045, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.847679138183594, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.886595606803894, + "num_tokens": 821637961.0, + "step": 21534 + }, + { + "epoch": 2.7394733494466355, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.573915481567383, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.858510434627533, + "num_tokens": 821684728.0, + "step": 21535 + }, + { + "epoch": 2.7396005597252255, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.68605613708496, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8805668354034424, + "num_tokens": 821715359.0, + "step": 21536 + }, + { + "epoch": 2.7397277700038165, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.496986389160156, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8675254583358765, + "num_tokens": 821755540.0, + "step": 21537 + }, + { + "epoch": 2.7398549802824066, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5799560546875, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8761383295059204, + "num_tokens": 821788506.0, + "step": 21538 + }, + { + "epoch": 2.7399821905609976, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.723434448242188, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8737787008285522, + "num_tokens": 821826555.0, + "step": 21539 + }, + { + "epoch": 2.7401094008395877, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64438247680664, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8652013540267944, + "num_tokens": 821866160.0, + "step": 21540 + }, + { + "epoch": 2.7402366111181786, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.549072265625, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8795866370201111, + "num_tokens": 821905595.0, + "step": 21541 + }, + { + "epoch": 2.7403638213967687, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.615467071533203, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8648086786270142, + "num_tokens": 821944769.0, + "step": 21542 + }, + { + "epoch": 2.7404910316753592, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5388126373291, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8822280168533325, + "num_tokens": 821988889.0, + "step": 21543 + }, + { + "epoch": 2.7406182419539498, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.488489151000977, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8778092861175537, + "num_tokens": 822025343.0, + "step": 21544 + }, + { + "epoch": 2.7407454522325403, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73470115661621, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8633866906166077, + "num_tokens": 822067651.0, + "step": 21545 + }, + { + "epoch": 2.740872662511131, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71350860595703, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8646913766860962, + "num_tokens": 822102885.0, + "step": 21546 + }, + { + "epoch": 2.7409998727897213, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.535276412963867, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8666355609893799, + "num_tokens": 822142515.0, + "step": 21547 + }, + { + "epoch": 2.741127083068312, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.565013885498047, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8708092570304871, + "num_tokens": 822180745.0, + "step": 21548 + }, + { + "epoch": 2.7412542933469024, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.734350204467773, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8815945982933044, + "num_tokens": 822215117.0, + "step": 21549 + }, + { + "epoch": 2.741381503625493, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.630659103393555, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8648097515106201, + "num_tokens": 822258883.0, + "step": 21550 + }, + { + "epoch": 2.7415087139040835, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.653867721557617, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8719243407249451, + "num_tokens": 822295254.0, + "step": 21551 + }, + { + "epoch": 2.741635924182674, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73002052307129, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8772730827331543, + "num_tokens": 822335769.0, + "step": 21552 + }, + { + "epoch": 2.7417631344612645, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.461517333984375, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8841103315353394, + "num_tokens": 822376873.0, + "step": 21553 + }, + { + "epoch": 2.741890344739855, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40067481994629, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.871705949306488, + "num_tokens": 822419868.0, + "step": 21554 + }, + { + "epoch": 2.7420175550184456, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.853225708007812, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8733117580413818, + "num_tokens": 822456757.0, + "step": 21555 + }, + { + "epoch": 2.742144765297036, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.28365135192871, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8881802558898926, + "num_tokens": 822492772.0, + "step": 21556 + }, + { + "epoch": 2.7422719755756266, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.747905731201172, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.858283281326294, + "num_tokens": 822535400.0, + "step": 21557 + }, + { + "epoch": 2.742399185854217, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.554012298583984, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.879565954208374, + "num_tokens": 822568313.0, + "step": 21558 + }, + { + "epoch": 2.7425263961328077, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.485366821289062, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8812570571899414, + "num_tokens": 822608487.0, + "step": 21559 + }, + { + "epoch": 2.742653606411398, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.85396385192871, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8755934834480286, + "num_tokens": 822647938.0, + "step": 21560 + }, + { + "epoch": 2.7427808166899883, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.659151077270508, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8700024485588074, + "num_tokens": 822685974.0, + "step": 21561 + }, + { + "epoch": 2.7429080269685793, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.631526947021484, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8775449991226196, + "num_tokens": 822719166.0, + "step": 21562 + }, + { + "epoch": 2.7430352372471694, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64224624633789, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8637328147888184, + "num_tokens": 822758364.0, + "step": 21563 + }, + { + "epoch": 2.7431624475257603, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.568920135498047, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8829792141914368, + "num_tokens": 822798002.0, + "step": 21564 + }, + { + "epoch": 2.7432896578043504, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.752561569213867, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8755737543106079, + "num_tokens": 822837958.0, + "step": 21565 + }, + { + "epoch": 2.743416868082941, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63291358947754, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8753092288970947, + "num_tokens": 822873076.0, + "step": 21566 + }, + { + "epoch": 2.7435440783615315, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.832860946655273, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8726013898849487, + "num_tokens": 822904693.0, + "step": 21567 + }, + { + "epoch": 2.743671288640122, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.609949111938477, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8697389364242554, + "num_tokens": 822941192.0, + "step": 21568 + }, + { + "epoch": 2.7437984989187125, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.808473587036133, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8617741465568542, + "num_tokens": 822982166.0, + "step": 21569 + }, + { + "epoch": 2.743925709197303, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.665803909301758, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8676751852035522, + "num_tokens": 823021630.0, + "step": 21570 + }, + { + "epoch": 2.7440529194758936, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45070457458496, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8738590478897095, + "num_tokens": 823062133.0, + "step": 21571 + }, + { + "epoch": 2.744180129754484, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.685054779052734, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8708615303039551, + "num_tokens": 823105163.0, + "step": 21572 + }, + { + "epoch": 2.7443073400330746, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.902324676513672, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8548005223274231, + "num_tokens": 823146878.0, + "step": 21573 + }, + { + "epoch": 2.744434550311665, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4654483795166, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8910306692123413, + "num_tokens": 823182216.0, + "step": 21574 + }, + { + "epoch": 2.7445617605902557, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.790971755981445, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.881706178188324, + "num_tokens": 823227838.0, + "step": 21575 + }, + { + "epoch": 2.7446889708688462, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6600341796875, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8658642768859863, + "num_tokens": 823268609.0, + "step": 21576 + }, + { + "epoch": 2.7448161811474368, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.774921417236328, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8825479745864868, + "num_tokens": 823309426.0, + "step": 21577 + }, + { + "epoch": 2.7449433914260273, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3369197845459, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8701483607292175, + "num_tokens": 823346049.0, + "step": 21578 + }, + { + "epoch": 2.745070601704618, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.197858810424805, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.867673397064209, + "num_tokens": 823378915.0, + "step": 21579 + }, + { + "epoch": 2.7451978119832083, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93812370300293, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8638939261436462, + "num_tokens": 823422985.0, + "step": 21580 + }, + { + "epoch": 2.745325022261799, + "ewc_loss": 0.0390625, + "ewc_loss_parallel": 3.910064697265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.322509765625, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8622408509254456, + "num_tokens": 823460703.0, + "step": 21581 + }, + { + "epoch": 2.7454522325403894, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66343879699707, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8771181106567383, + "num_tokens": 823499651.0, + "step": 21582 + }, + { + "epoch": 2.74557944281898, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.487260818481445, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8727422952651978, + "num_tokens": 823537044.0, + "step": 21583 + }, + { + "epoch": 2.7457066530975704, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.652965545654297, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.887595534324646, + "num_tokens": 823571832.0, + "step": 21584 + }, + { + "epoch": 2.745833863376161, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.383525848388672, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8805886507034302, + "num_tokens": 823609852.0, + "step": 21585 + }, + { + "epoch": 2.745961073654751, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50872802734375, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8780547380447388, + "num_tokens": 823644419.0, + "step": 21586 + }, + { + "epoch": 2.746088283933342, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59158706665039, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8799113035202026, + "num_tokens": 823682451.0, + "step": 21587 + }, + { + "epoch": 2.746215494211932, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.640575408935547, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8847654461860657, + "num_tokens": 823723051.0, + "step": 21588 + }, + { + "epoch": 2.746342704490523, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.5615234375, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8711079955101013, + "num_tokens": 823761353.0, + "step": 21589 + }, + { + "epoch": 2.746469914769113, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.52263641357422, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8602295517921448, + "num_tokens": 823800281.0, + "step": 21590 + }, + { + "epoch": 2.7465971250477037, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.704912185668945, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8882864117622375, + "num_tokens": 823843255.0, + "step": 21591 + }, + { + "epoch": 2.7467243353262942, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.576383590698242, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8710404634475708, + "num_tokens": 823880736.0, + "step": 21592 + }, + { + "epoch": 2.7468515456048848, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72113609313965, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.872309684753418, + "num_tokens": 823923232.0, + "step": 21593 + }, + { + "epoch": 2.7469787558834753, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.525192260742188, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8778092265129089, + "num_tokens": 823957488.0, + "step": 21594 + }, + { + "epoch": 2.747105966162066, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.519655227661133, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.869970440864563, + "num_tokens": 823999568.0, + "step": 21595 + }, + { + "epoch": 2.7472331764406563, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.580392837524414, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8858877420425415, + "num_tokens": 824036947.0, + "step": 21596 + }, + { + "epoch": 2.747360386719247, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.747446060180664, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8870302438735962, + "num_tokens": 824067966.0, + "step": 21597 + }, + { + "epoch": 2.7474875969978374, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61273765563965, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8731918931007385, + "num_tokens": 824112216.0, + "step": 21598 + }, + { + "epoch": 2.747614807276428, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.704835891723633, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8844448328018188, + "num_tokens": 824148749.0, + "step": 21599 + }, + { + "epoch": 2.7477420175550185, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.600597381591797, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8731777667999268, + "num_tokens": 824186685.0, + "step": 21600 + }, + { + "epoch": 2.747869227833609, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51774024963379, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.869443416595459, + "num_tokens": 824223716.0, + "step": 21601 + }, + { + "epoch": 2.7479964381121995, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94931983947754, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8729904294013977, + "num_tokens": 824264627.0, + "step": 21602 + }, + { + "epoch": 2.74812364839079, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.67003059387207, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.874367356300354, + "num_tokens": 824301901.0, + "step": 21603 + }, + { + "epoch": 2.7482508586693806, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.695106506347656, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8780554533004761, + "num_tokens": 824344621.0, + "step": 21604 + }, + { + "epoch": 2.748378068947971, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69411849975586, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8795673251152039, + "num_tokens": 824385850.0, + "step": 21605 + }, + { + "epoch": 2.7485052792265616, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.50672721862793, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.9013242721557617, + "num_tokens": 824425545.0, + "step": 21606 + }, + { + "epoch": 2.748632489505152, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86805534362793, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8915320634841919, + "num_tokens": 824468611.0, + "step": 21607 + }, + { + "epoch": 2.7487596997837427, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.630136489868164, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8792539834976196, + "num_tokens": 824512161.0, + "step": 21608 + }, + { + "epoch": 2.7488869100623328, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.646957397460938, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8866550922393799, + "num_tokens": 824555502.0, + "step": 21609 + }, + { + "epoch": 2.7490141203409237, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63214111328125, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8679320812225342, + "num_tokens": 824596277.0, + "step": 21610 + }, + { + "epoch": 2.749141330619514, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60131072998047, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8629946112632751, + "num_tokens": 824633704.0, + "step": 21611 + }, + { + "epoch": 2.749268540898105, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.645458221435547, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8796514868736267, + "num_tokens": 824673584.0, + "step": 21612 + }, + { + "epoch": 2.749395751176695, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.642316818237305, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8692550659179688, + "num_tokens": 824712508.0, + "step": 21613 + }, + { + "epoch": 2.749522961455286, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.642614364624023, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8851306438446045, + "num_tokens": 824754353.0, + "step": 21614 + }, + { + "epoch": 2.749650171733876, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.824853897094727, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8561021089553833, + "num_tokens": 824796333.0, + "step": 21615 + }, + { + "epoch": 2.7497773820124665, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62339210510254, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.868561863899231, + "num_tokens": 824832048.0, + "step": 21616 + }, + { + "epoch": 2.749904592291057, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.488765716552734, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.869280993938446, + "num_tokens": 824871889.0, + "step": 21617 + }, + { + "epoch": 2.7500318025696475, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.600061416625977, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8852707147598267, + "num_tokens": 824912547.0, + "step": 21618 + }, + { + "epoch": 2.750159012848238, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66257095336914, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8549112677574158, + "num_tokens": 824955626.0, + "step": 21619 + }, + { + "epoch": 2.7502862231268286, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.517902374267578, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.882738471031189, + "num_tokens": 824991813.0, + "step": 21620 + }, + { + "epoch": 2.750413433405419, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.686189651489258, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8831667304039001, + "num_tokens": 825024356.0, + "step": 21621 + }, + { + "epoch": 2.7505406436840096, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.879959106445312, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.871084451675415, + "num_tokens": 825060316.0, + "step": 21622 + }, + { + "epoch": 2.7506678539626, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.712411880493164, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.867856502532959, + "num_tokens": 825096044.0, + "step": 21623 + }, + { + "epoch": 2.7507950642411907, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.583759307861328, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8737481832504272, + "num_tokens": 825123999.0, + "step": 21624 + }, + { + "epoch": 2.750922274519781, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.405935287475586, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8643980026245117, + "num_tokens": 825170155.0, + "step": 21625 + }, + { + "epoch": 2.7510494847983717, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.576576232910156, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8737390041351318, + "num_tokens": 825213388.0, + "step": 21626 + }, + { + "epoch": 2.7511766950769623, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.391691207885742, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8817950487136841, + "num_tokens": 825247171.0, + "step": 21627 + }, + { + "epoch": 2.751303905355553, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.745201110839844, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8758147954940796, + "num_tokens": 825283271.0, + "step": 21628 + }, + { + "epoch": 2.7514311156341433, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.67981719970703, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8733724355697632, + "num_tokens": 825318976.0, + "step": 21629 + }, + { + "epoch": 2.751558325912734, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.458850860595703, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8738450407981873, + "num_tokens": 825358019.0, + "step": 21630 + }, + { + "epoch": 2.7516855361913244, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.673463821411133, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8911553025245667, + "num_tokens": 825394025.0, + "step": 21631 + }, + { + "epoch": 2.751812746469915, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.622051239013672, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8852246999740601, + "num_tokens": 825433021.0, + "step": 21632 + }, + { + "epoch": 2.7519399567485054, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.396129608154297, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8758843541145325, + "num_tokens": 825477026.0, + "step": 21633 + }, + { + "epoch": 2.7520671670270955, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.729955673217773, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8793041706085205, + "num_tokens": 825511054.0, + "step": 21634 + }, + { + "epoch": 2.7521943773056865, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.644906997680664, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8601462841033936, + "num_tokens": 825549654.0, + "step": 21635 + }, + { + "epoch": 2.7523215875842766, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.461132049560547, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8764533996582031, + "num_tokens": 825587269.0, + "step": 21636 + }, + { + "epoch": 2.7524487978628676, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.67415428161621, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8969784379005432, + "num_tokens": 825624864.0, + "step": 21637 + }, + { + "epoch": 2.7525760081414576, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77812957763672, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8668524026870728, + "num_tokens": 825667025.0, + "step": 21638 + }, + { + "epoch": 2.7527032184200486, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.46045684814453, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8601648211479187, + "num_tokens": 825709505.0, + "step": 21639 + }, + { + "epoch": 2.7528304286986387, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.555742263793945, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8776129484176636, + "num_tokens": 825747376.0, + "step": 21640 + }, + { + "epoch": 2.7529576389772292, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.610103607177734, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.88046795129776, + "num_tokens": 825784817.0, + "step": 21641 + }, + { + "epoch": 2.7530848492558198, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.310653686523438, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8726580142974854, + "num_tokens": 825815631.0, + "step": 21642 + }, + { + "epoch": 2.7532120595344103, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.622255325317383, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8741524815559387, + "num_tokens": 825854202.0, + "step": 21643 + }, + { + "epoch": 2.753339269813001, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.872112274169922, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.882882297039032, + "num_tokens": 825890636.0, + "step": 21644 + }, + { + "epoch": 2.7534664800915913, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.627408981323242, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.85288405418396, + "num_tokens": 825931054.0, + "step": 21645 + }, + { + "epoch": 2.753593690370182, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.589397430419922, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8704972267150879, + "num_tokens": 825972519.0, + "step": 21646 + }, + { + "epoch": 2.7537209006487724, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.552234649658203, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8762221336364746, + "num_tokens": 826010010.0, + "step": 21647 + }, + { + "epoch": 2.753848110927363, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.687284469604492, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8736959099769592, + "num_tokens": 826048348.0, + "step": 21648 + }, + { + "epoch": 2.7539753212059535, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.405344009399414, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8715836405754089, + "num_tokens": 826087807.0, + "step": 21649 + }, + { + "epoch": 2.754102531484544, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.739748001098633, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8708934187889099, + "num_tokens": 826124657.0, + "step": 21650 + }, + { + "epoch": 2.7542297417631345, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84848976135254, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8764598369598389, + "num_tokens": 826164190.0, + "step": 21651 + }, + { + "epoch": 2.754356952041725, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88035774230957, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8756168484687805, + "num_tokens": 826202876.0, + "step": 21652 + }, + { + "epoch": 2.7544841623203156, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.40281867980957, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8732803463935852, + "num_tokens": 826242955.0, + "step": 21653 + }, + { + "epoch": 2.754611372598906, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75503158569336, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8832402229309082, + "num_tokens": 826279925.0, + "step": 21654 + }, + { + "epoch": 2.7547385828774966, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.67006492614746, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8847193717956543, + "num_tokens": 826314076.0, + "step": 21655 + }, + { + "epoch": 2.754865793156087, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.620643615722656, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8713811039924622, + "num_tokens": 826354138.0, + "step": 21656 + }, + { + "epoch": 2.7549930034346777, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4213809967041, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8703019618988037, + "num_tokens": 826395393.0, + "step": 21657 + }, + { + "epoch": 2.755120213713268, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.827228546142578, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8648022413253784, + "num_tokens": 826437539.0, + "step": 21658 + }, + { + "epoch": 2.7552474239918583, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.955747604370117, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8688095211982727, + "num_tokens": 826475857.0, + "step": 21659 + }, + { + "epoch": 2.7553746342704493, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.458229064941406, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8839353322982788, + "num_tokens": 826513956.0, + "step": 21660 + }, + { + "epoch": 2.7555018445490393, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.607070922851562, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8727399110794067, + "num_tokens": 826552747.0, + "step": 21661 + }, + { + "epoch": 2.7556290548276303, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7278995513916, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8814361691474915, + "num_tokens": 826590830.0, + "step": 21662 + }, + { + "epoch": 2.7557562651062204, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.351661682128906, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8779236078262329, + "num_tokens": 826633414.0, + "step": 21663 + }, + { + "epoch": 2.755883475384811, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.654218673706055, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8832504153251648, + "num_tokens": 826668344.0, + "step": 21664 + }, + { + "epoch": 2.7560106856634015, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.293109893798828, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8572714328765869, + "num_tokens": 826710587.0, + "step": 21665 + }, + { + "epoch": 2.756137895941992, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.974315643310547, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8654077053070068, + "num_tokens": 826748854.0, + "step": 21666 + }, + { + "epoch": 2.7562651062205825, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.685840606689453, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8614633083343506, + "num_tokens": 826789591.0, + "step": 21667 + }, + { + "epoch": 2.756392316499173, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.480459213256836, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8705899119377136, + "num_tokens": 826831890.0, + "step": 21668 + }, + { + "epoch": 2.7565195267777636, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61348533630371, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8562970757484436, + "num_tokens": 826870265.0, + "step": 21669 + }, + { + "epoch": 2.756646737056354, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.597322463989258, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8741207122802734, + "num_tokens": 826910993.0, + "step": 21670 + }, + { + "epoch": 2.7567739473349446, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62537956237793, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8728903532028198, + "num_tokens": 826947492.0, + "step": 21671 + }, + { + "epoch": 2.756901157613535, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.594276428222656, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8778948187828064, + "num_tokens": 826987215.0, + "step": 21672 + }, + { + "epoch": 2.7570283678921257, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.3071346282959, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8713282346725464, + "num_tokens": 827024092.0, + "step": 21673 + }, + { + "epoch": 2.757155578170716, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.953840255737305, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8573553562164307, + "num_tokens": 827060541.0, + "step": 21674 + }, + { + "epoch": 2.7572827884493067, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.606203079223633, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8733984231948853, + "num_tokens": 827101205.0, + "step": 21675 + }, + { + "epoch": 2.7574099987278973, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4263973236084, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8686226010322571, + "num_tokens": 827132441.0, + "step": 21676 + }, + { + "epoch": 2.757537209006488, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.561521530151367, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.874287486076355, + "num_tokens": 827169530.0, + "step": 21677 + }, + { + "epoch": 2.7576644192850783, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66021156311035, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8809127807617188, + "num_tokens": 827207352.0, + "step": 21678 + }, + { + "epoch": 2.757791629563669, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.677526473999023, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8691680431365967, + "num_tokens": 827242790.0, + "step": 21679 + }, + { + "epoch": 2.7579188398422594, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.45017433166504, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8755524754524231, + "num_tokens": 827281676.0, + "step": 21680 + }, + { + "epoch": 2.75804605012085, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.609628677368164, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8874844312667847, + "num_tokens": 827313676.0, + "step": 21681 + }, + { + "epoch": 2.7581732603994404, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.807950973510742, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8741598129272461, + "num_tokens": 827355017.0, + "step": 21682 + }, + { + "epoch": 2.758300470678031, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60905647277832, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8908801674842834, + "num_tokens": 827391773.0, + "step": 21683 + }, + { + "epoch": 2.758427680956621, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54503631591797, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8886138796806335, + "num_tokens": 827433034.0, + "step": 21684 + }, + { + "epoch": 2.758554891235212, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.410131454467773, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8848563432693481, + "num_tokens": 827475729.0, + "step": 21685 + }, + { + "epoch": 2.758682101513802, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.581750869750977, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8791264891624451, + "num_tokens": 827514127.0, + "step": 21686 + }, + { + "epoch": 2.758809311792393, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61642074584961, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8847554326057434, + "num_tokens": 827546709.0, + "step": 21687 + }, + { + "epoch": 2.758936522070983, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.602392196655273, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8944706916809082, + "num_tokens": 827581450.0, + "step": 21688 + }, + { + "epoch": 2.7590637323495737, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.451560974121094, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8699098229408264, + "num_tokens": 827623161.0, + "step": 21689 + }, + { + "epoch": 2.7591909426281642, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.554981231689453, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8695933818817139, + "num_tokens": 827659460.0, + "step": 21690 + }, + { + "epoch": 2.7593181529067548, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.535722732543945, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.869076132774353, + "num_tokens": 827701110.0, + "step": 21691 + }, + { + "epoch": 2.7594453631853453, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.659988403320312, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8708615303039551, + "num_tokens": 827738992.0, + "step": 21692 + }, + { + "epoch": 2.759572573463936, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.68682861328125, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8603862524032593, + "num_tokens": 827775853.0, + "step": 21693 + }, + { + "epoch": 2.7596997837425263, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.39665412902832, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8753066658973694, + "num_tokens": 827819438.0, + "step": 21694 + }, + { + "epoch": 2.759826994021117, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.692676544189453, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8686057329177856, + "num_tokens": 827858833.0, + "step": 21695 + }, + { + "epoch": 2.7599542042997074, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.542465209960938, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.867729902267456, + "num_tokens": 827895069.0, + "step": 21696 + }, + { + "epoch": 2.760081414578298, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.621952056884766, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8875781297683716, + "num_tokens": 827932970.0, + "step": 21697 + }, + { + "epoch": 2.7602086248568884, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.462167739868164, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8855761289596558, + "num_tokens": 827981859.0, + "step": 21698 + }, + { + "epoch": 2.760335835135479, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64695930480957, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8686708211898804, + "num_tokens": 828022018.0, + "step": 21699 + }, + { + "epoch": 2.7604630454140695, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.499378204345703, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8703341484069824, + "num_tokens": 828061546.0, + "step": 21700 + }, + { + "epoch": 2.76059025569266, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.637651443481445, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8612689971923828, + "num_tokens": 828104464.0, + "step": 21701 + }, + { + "epoch": 2.7607174659712506, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70504379272461, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.863510251045227, + "num_tokens": 828142527.0, + "step": 21702 + }, + { + "epoch": 2.760844676249841, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.508054733276367, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8743332624435425, + "num_tokens": 828189670.0, + "step": 21703 + }, + { + "epoch": 2.7609718865284316, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.653627395629883, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8811625242233276, + "num_tokens": 828231511.0, + "step": 21704 + }, + { + "epoch": 2.761099096807022, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.644529342651367, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8741775751113892, + "num_tokens": 828274387.0, + "step": 21705 + }, + { + "epoch": 2.7612263070856127, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53221321105957, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8678871393203735, + "num_tokens": 828312263.0, + "step": 21706 + }, + { + "epoch": 2.7613535173642028, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.584177017211914, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8753175735473633, + "num_tokens": 828345550.0, + "step": 21707 + }, + { + "epoch": 2.7614807276427937, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.762941360473633, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8647247552871704, + "num_tokens": 828378633.0, + "step": 21708 + }, + { + "epoch": 2.761607937921384, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.667892456054688, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8680769205093384, + "num_tokens": 828414292.0, + "step": 21709 + }, + { + "epoch": 2.761735148199975, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.528696060180664, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8737595081329346, + "num_tokens": 828452682.0, + "step": 21710 + }, + { + "epoch": 2.761862358478565, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.615755081176758, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8892010450363159, + "num_tokens": 828494162.0, + "step": 21711 + }, + { + "epoch": 2.761989568757156, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51718521118164, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8786126375198364, + "num_tokens": 828525997.0, + "step": 21712 + }, + { + "epoch": 2.762116779035746, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53484344482422, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8729150295257568, + "num_tokens": 828562488.0, + "step": 21713 + }, + { + "epoch": 2.7622439893143365, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.679241180419922, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8917785882949829, + "num_tokens": 828591818.0, + "step": 21714 + }, + { + "epoch": 2.762371199592927, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.620176315307617, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8624535799026489, + "num_tokens": 828628872.0, + "step": 21715 + }, + { + "epoch": 2.7624984098715175, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56114959716797, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8829114437103271, + "num_tokens": 828670939.0, + "step": 21716 + }, + { + "epoch": 2.762625620150108, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.732751846313477, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8566632270812988, + "num_tokens": 828708446.0, + "step": 21717 + }, + { + "epoch": 2.7627528304286986, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51344108581543, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8777934312820435, + "num_tokens": 828749873.0, + "step": 21718 + }, + { + "epoch": 2.762880040707289, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66444206237793, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8670668005943298, + "num_tokens": 828790066.0, + "step": 21719 + }, + { + "epoch": 2.7630072509858796, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.699752807617188, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8704500198364258, + "num_tokens": 828821911.0, + "step": 21720 + }, + { + "epoch": 2.76313446126447, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.685495376586914, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8690167665481567, + "num_tokens": 828856572.0, + "step": 21721 + }, + { + "epoch": 2.7632616715430607, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58772850036621, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.885929524898529, + "num_tokens": 828898865.0, + "step": 21722 + }, + { + "epoch": 2.763388881821651, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.559247970581055, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8818292617797852, + "num_tokens": 828933768.0, + "step": 21723 + }, + { + "epoch": 2.7635160921002417, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.479162216186523, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8893930912017822, + "num_tokens": 828968869.0, + "step": 21724 + }, + { + "epoch": 2.7636433023788323, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.732107162475586, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8577329516410828, + "num_tokens": 829005621.0, + "step": 21725 + }, + { + "epoch": 2.763770512657423, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.450153350830078, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8643791675567627, + "num_tokens": 829045046.0, + "step": 21726 + }, + { + "epoch": 2.7638977229360133, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62874412536621, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8873899579048157, + "num_tokens": 829084588.0, + "step": 21727 + }, + { + "epoch": 2.764024933214604, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6895751953125, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8580989241600037, + "num_tokens": 829121010.0, + "step": 21728 + }, + { + "epoch": 2.7641521434931944, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.611984252929688, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.882726788520813, + "num_tokens": 829159618.0, + "step": 21729 + }, + { + "epoch": 2.764279353771785, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61595344543457, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.861290693283081, + "num_tokens": 829204826.0, + "step": 21730 + }, + { + "epoch": 2.7644065640503754, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62394142150879, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.884666919708252, + "num_tokens": 829240741.0, + "step": 21731 + }, + { + "epoch": 2.7645337743289655, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.416458129882812, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8931601047515869, + "num_tokens": 829278065.0, + "step": 21732 + }, + { + "epoch": 2.7646609846075565, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59859275817871, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8705543875694275, + "num_tokens": 829319438.0, + "step": 21733 + }, + { + "epoch": 2.7647881948861466, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.52924919128418, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8842639923095703, + "num_tokens": 829357299.0, + "step": 21734 + }, + { + "epoch": 2.7649154051647375, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.452173233032227, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.874691367149353, + "num_tokens": 829398600.0, + "step": 21735 + }, + { + "epoch": 2.7650426154433276, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72423553466797, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8634609580039978, + "num_tokens": 829439214.0, + "step": 21736 + }, + { + "epoch": 2.7651698257219186, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.511672973632812, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8644641637802124, + "num_tokens": 829477634.0, + "step": 21737 + }, + { + "epoch": 2.7652970360005087, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59341812133789, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8803688287734985, + "num_tokens": 829514527.0, + "step": 21738 + }, + { + "epoch": 2.765424246279099, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.672603607177734, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8756744861602783, + "num_tokens": 829558531.0, + "step": 21739 + }, + { + "epoch": 2.7655514565576897, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.445600509643555, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8760398030281067, + "num_tokens": 829600769.0, + "step": 21740 + }, + { + "epoch": 2.7656786668362803, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.615407943725586, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8788086771965027, + "num_tokens": 829635938.0, + "step": 21741 + }, + { + "epoch": 2.765805877114871, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74898338317871, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.873985767364502, + "num_tokens": 829672956.0, + "step": 21742 + }, + { + "epoch": 2.7659330873934613, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.579116821289062, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8734769821166992, + "num_tokens": 829712658.0, + "step": 21743 + }, + { + "epoch": 2.766060297672052, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.749095916748047, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8770661354064941, + "num_tokens": 829755429.0, + "step": 21744 + }, + { + "epoch": 2.7661875079506424, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80082893371582, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8731107115745544, + "num_tokens": 829796373.0, + "step": 21745 + }, + { + "epoch": 2.766314718229233, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.514741897583008, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.869365394115448, + "num_tokens": 829834049.0, + "step": 21746 + }, + { + "epoch": 2.7664419285078234, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89260482788086, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8738235235214233, + "num_tokens": 829870853.0, + "step": 21747 + }, + { + "epoch": 2.766569138786414, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69095802307129, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8682513236999512, + "num_tokens": 829907519.0, + "step": 21748 + }, + { + "epoch": 2.7666963490650045, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.567882537841797, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.860984742641449, + "num_tokens": 829947162.0, + "step": 21749 + }, + { + "epoch": 2.766823559343595, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.633075714111328, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8716230392456055, + "num_tokens": 829991981.0, + "step": 21750 + }, + { + "epoch": 2.7669507696221856, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.474666595458984, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8798447847366333, + "num_tokens": 830032409.0, + "step": 21751 + }, + { + "epoch": 2.767077979900776, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75014305114746, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8773176670074463, + "num_tokens": 830072738.0, + "step": 21752 + }, + { + "epoch": 2.7672051901793666, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.910730361938477, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8703274130821228, + "num_tokens": 830110880.0, + "step": 21753 + }, + { + "epoch": 2.767332400457957, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.563016891479492, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8699960708618164, + "num_tokens": 830150148.0, + "step": 21754 + }, + { + "epoch": 2.7674596107365477, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.719505310058594, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8626998662948608, + "num_tokens": 830190717.0, + "step": 21755 + }, + { + "epoch": 2.767586821015138, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53607177734375, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8756251335144043, + "num_tokens": 830226972.0, + "step": 21756 + }, + { + "epoch": 2.7677140312937283, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.811437606811523, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8825139999389648, + "num_tokens": 830262612.0, + "step": 21757 + }, + { + "epoch": 2.7678412415723193, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75655174255371, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8802264928817749, + "num_tokens": 830306856.0, + "step": 21758 + }, + { + "epoch": 2.7679684518509093, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.630617141723633, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8729995489120483, + "num_tokens": 830340726.0, + "step": 21759 + }, + { + "epoch": 2.7680956621295003, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75737190246582, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8739345073699951, + "num_tokens": 830380697.0, + "step": 21760 + }, + { + "epoch": 2.7682228724080904, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.838041305541992, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8516876697540283, + "num_tokens": 830414737.0, + "step": 21761 + }, + { + "epoch": 2.768350082686681, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62798309326172, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.874507486820221, + "num_tokens": 830456518.0, + "step": 21762 + }, + { + "epoch": 2.7684772929652715, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.822021484375, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8706368207931519, + "num_tokens": 830494192.0, + "step": 21763 + }, + { + "epoch": 2.768604503243862, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.431703567504883, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8684964179992676, + "num_tokens": 830533659.0, + "step": 21764 + }, + { + "epoch": 2.7687317135224525, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.904308319091797, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8674347400665283, + "num_tokens": 830569605.0, + "step": 21765 + }, + { + "epoch": 2.768858923801043, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.841922760009766, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8519726991653442, + "num_tokens": 830610814.0, + "step": 21766 + }, + { + "epoch": 2.7689861340796336, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.510717391967773, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8733267188072205, + "num_tokens": 830646811.0, + "step": 21767 + }, + { + "epoch": 2.769113344358224, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.927003860473633, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8773455619812012, + "num_tokens": 830680984.0, + "step": 21768 + }, + { + "epoch": 2.7692405546368146, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.733625411987305, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8738563656806946, + "num_tokens": 830713384.0, + "step": 21769 + }, + { + "epoch": 2.769367764915405, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.421457290649414, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8615778088569641, + "num_tokens": 830746922.0, + "step": 21770 + }, + { + "epoch": 2.7694949751939957, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.747228622436523, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8797886371612549, + "num_tokens": 830789459.0, + "step": 21771 + }, + { + "epoch": 2.769622185472586, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.612369537353516, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8714824914932251, + "num_tokens": 830826736.0, + "step": 21772 + }, + { + "epoch": 2.7697493957511767, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60302734375, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8756819367408752, + "num_tokens": 830868118.0, + "step": 21773 + }, + { + "epoch": 2.7698766060297673, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.609928131103516, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8709642887115479, + "num_tokens": 830911658.0, + "step": 21774 + }, + { + "epoch": 2.770003816308358, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.427579879760742, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8750013113021851, + "num_tokens": 830948456.0, + "step": 21775 + }, + { + "epoch": 2.7701310265869483, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.714876174926758, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8829084038734436, + "num_tokens": 830988790.0, + "step": 21776 + }, + { + "epoch": 2.770258236865539, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56637954711914, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8668165802955627, + "num_tokens": 831024551.0, + "step": 21777 + }, + { + "epoch": 2.7703854471441294, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.476558685302734, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8728904724121094, + "num_tokens": 831060082.0, + "step": 21778 + }, + { + "epoch": 2.77051265742272, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.689899444580078, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.875205397605896, + "num_tokens": 831100067.0, + "step": 21779 + }, + { + "epoch": 2.77063986770131, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.516504287719727, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8846995830535889, + "num_tokens": 831131923.0, + "step": 21780 + }, + { + "epoch": 2.770767077979901, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.587644577026367, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8590595722198486, + "num_tokens": 831167985.0, + "step": 21781 + }, + { + "epoch": 2.770894288258491, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56902313232422, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8692986965179443, + "num_tokens": 831211737.0, + "step": 21782 + }, + { + "epoch": 2.771021498537082, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.909997940063477, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8626195788383484, + "num_tokens": 831248764.0, + "step": 21783 + }, + { + "epoch": 2.771148708815672, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.639354705810547, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8799102306365967, + "num_tokens": 831284140.0, + "step": 21784 + }, + { + "epoch": 2.771275919094263, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58365821838379, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8771174550056458, + "num_tokens": 831318567.0, + "step": 21785 + }, + { + "epoch": 2.771403129372853, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.660696029663086, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8668925762176514, + "num_tokens": 831357128.0, + "step": 21786 + }, + { + "epoch": 2.7715303396514437, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70524787902832, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8536343574523926, + "num_tokens": 831391551.0, + "step": 21787 + }, + { + "epoch": 2.771657549930034, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.595287322998047, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8694505095481873, + "num_tokens": 831430332.0, + "step": 21788 + }, + { + "epoch": 2.7717847602086247, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.631689071655273, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8789712190628052, + "num_tokens": 831471639.0, + "step": 21789 + }, + { + "epoch": 2.7719119704872153, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.146690368652344, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8793448209762573, + "num_tokens": 831506470.0, + "step": 21790 + }, + { + "epoch": 2.772039180765806, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57583236694336, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8661977052688599, + "num_tokens": 831547212.0, + "step": 21791 + }, + { + "epoch": 2.7721663910443963, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54420280456543, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8594928979873657, + "num_tokens": 831593181.0, + "step": 21792 + }, + { + "epoch": 2.772293601322987, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.992820739746094, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8895516395568848, + "num_tokens": 831624889.0, + "step": 21793 + }, + { + "epoch": 2.7724208116015774, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.765289306640625, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8798606395721436, + "num_tokens": 831662389.0, + "step": 21794 + }, + { + "epoch": 2.772548021880168, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.463632583618164, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8756898641586304, + "num_tokens": 831705282.0, + "step": 21795 + }, + { + "epoch": 2.7726752321587584, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1176815032959, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8803908228874207, + "num_tokens": 831741604.0, + "step": 21796 + }, + { + "epoch": 2.772802442437349, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.514320373535156, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8652223348617554, + "num_tokens": 831784165.0, + "step": 21797 + }, + { + "epoch": 2.7729296527159395, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.367664337158203, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8784890174865723, + "num_tokens": 831821101.0, + "step": 21798 + }, + { + "epoch": 2.77305686299453, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58465003967285, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8759276866912842, + "num_tokens": 831855766.0, + "step": 21799 + }, + { + "epoch": 2.7731840732731206, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.714101791381836, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8801726698875427, + "num_tokens": 831895692.0, + "step": 21800 + }, + { + "epoch": 2.773311283551711, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.598848342895508, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8921328783035278, + "num_tokens": 831931079.0, + "step": 21801 + }, + { + "epoch": 2.7734384938303016, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.691570281982422, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8485362529754639, + "num_tokens": 831969742.0, + "step": 21802 + }, + { + "epoch": 2.773565704108892, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.579185485839844, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8642129302024841, + "num_tokens": 832011059.0, + "step": 21803 + }, + { + "epoch": 2.7736929143874827, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63713836669922, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8833153247833252, + "num_tokens": 832042768.0, + "step": 21804 + }, + { + "epoch": 2.7738201246660728, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.695470809936523, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.882016658782959, + "num_tokens": 832077699.0, + "step": 21805 + }, + { + "epoch": 2.7739473349446637, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.541479110717773, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8813982009887695, + "num_tokens": 832113528.0, + "step": 21806 + }, + { + "epoch": 2.774074545223254, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.511852264404297, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8778313398361206, + "num_tokens": 832149108.0, + "step": 21807 + }, + { + "epoch": 2.774201755501845, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.568220138549805, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8902878165245056, + "num_tokens": 832189279.0, + "step": 21808 + }, + { + "epoch": 2.774328965780435, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.641536712646484, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8754879236221313, + "num_tokens": 832224423.0, + "step": 21809 + }, + { + "epoch": 2.774456176059026, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.552967071533203, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.867339551448822, + "num_tokens": 832260389.0, + "step": 21810 + }, + { + "epoch": 2.774583386337616, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.700763702392578, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8726770281791687, + "num_tokens": 832297591.0, + "step": 21811 + }, + { + "epoch": 2.7747105966162064, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.474645614624023, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8786741495132446, + "num_tokens": 832330195.0, + "step": 21812 + }, + { + "epoch": 2.774837806894797, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.826229095458984, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8807801008224487, + "num_tokens": 832361613.0, + "step": 21813 + }, + { + "epoch": 2.7749650171733875, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.785343170166016, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8793520927429199, + "num_tokens": 832402800.0, + "step": 21814 + }, + { + "epoch": 2.775092227451978, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58774757385254, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8749662637710571, + "num_tokens": 832445161.0, + "step": 21815 + }, + { + "epoch": 2.7752194377305686, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.76654815673828, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8594614267349243, + "num_tokens": 832487905.0, + "step": 21816 + }, + { + "epoch": 2.775346648009159, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.554553985595703, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8755471706390381, + "num_tokens": 832521107.0, + "step": 21817 + }, + { + "epoch": 2.7754738582877496, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.081825256347656, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8693537712097168, + "num_tokens": 832557213.0, + "step": 21818 + }, + { + "epoch": 2.77560106856634, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.723480224609375, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8618888854980469, + "num_tokens": 832600445.0, + "step": 21819 + }, + { + "epoch": 2.7757282788449307, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.564231872558594, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8728066682815552, + "num_tokens": 832635994.0, + "step": 21820 + }, + { + "epoch": 2.775855489123521, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.48410987854004, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.88484787940979, + "num_tokens": 832677151.0, + "step": 21821 + }, + { + "epoch": 2.7759826994021117, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7989501953125, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8737531900405884, + "num_tokens": 832713995.0, + "step": 21822 + }, + { + "epoch": 2.7761099096807023, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.504133224487305, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8648536205291748, + "num_tokens": 832754201.0, + "step": 21823 + }, + { + "epoch": 2.776237119959293, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.022541046142578, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8712050914764404, + "num_tokens": 832796920.0, + "step": 21824 + }, + { + "epoch": 2.7763643302378833, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86441421508789, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8711309432983398, + "num_tokens": 832835952.0, + "step": 21825 + }, + { + "epoch": 2.776491540516474, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.660667419433594, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.877000093460083, + "num_tokens": 832878306.0, + "step": 21826 + }, + { + "epoch": 2.7766187507950644, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.994152069091797, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8655247688293457, + "num_tokens": 832914315.0, + "step": 21827 + }, + { + "epoch": 2.776745961073655, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.853601455688477, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8701351881027222, + "num_tokens": 832956661.0, + "step": 21828 + }, + { + "epoch": 2.7768731713522454, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94122886657715, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8797652125358582, + "num_tokens": 832999446.0, + "step": 21829 + }, + { + "epoch": 2.7770003816308355, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7559757232666, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8474202752113342, + "num_tokens": 833039001.0, + "step": 21830 + }, + { + "epoch": 2.7771275919094265, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.523454666137695, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8724192976951599, + "num_tokens": 833074334.0, + "step": 21831 + }, + { + "epoch": 2.7772548021880166, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.680908203125, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8623271584510803, + "num_tokens": 833113089.0, + "step": 21832 + }, + { + "epoch": 2.7773820124666075, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71010398864746, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8703086376190186, + "num_tokens": 833149755.0, + "step": 21833 + }, + { + "epoch": 2.7775092227451976, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71246910095215, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8652106523513794, + "num_tokens": 833187718.0, + "step": 21834 + }, + { + "epoch": 2.7776364330237886, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.930644989013672, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8805065155029297, + "num_tokens": 833221416.0, + "step": 21835 + }, + { + "epoch": 2.7777636433023787, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.495790481567383, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8863062858581543, + "num_tokens": 833261101.0, + "step": 21836 + }, + { + "epoch": 2.777890853580969, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.757997512817383, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8821210265159607, + "num_tokens": 833296804.0, + "step": 21837 + }, + { + "epoch": 2.7780180638595597, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64433479309082, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8743118047714233, + "num_tokens": 833332005.0, + "step": 21838 + }, + { + "epoch": 2.7781452741381503, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.474748611450195, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8696384429931641, + "num_tokens": 833371991.0, + "step": 21839 + }, + { + "epoch": 2.778272484416741, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62275505065918, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.88490891456604, + "num_tokens": 833407514.0, + "step": 21840 + }, + { + "epoch": 2.7783996946953313, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.893356323242188, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8722680807113647, + "num_tokens": 833444519.0, + "step": 21841 + }, + { + "epoch": 2.778526904973922, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.646770477294922, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8697153925895691, + "num_tokens": 833482252.0, + "step": 21842 + }, + { + "epoch": 2.7786541152525124, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.736709594726562, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8720751404762268, + "num_tokens": 833515513.0, + "step": 21843 + }, + { + "epoch": 2.778781325531103, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7520751953125, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8812589049339294, + "num_tokens": 833551188.0, + "step": 21844 + }, + { + "epoch": 2.7789085358096934, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.639732360839844, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8839468359947205, + "num_tokens": 833589159.0, + "step": 21845 + }, + { + "epoch": 2.779035746088284, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54401969909668, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8640193343162537, + "num_tokens": 833623581.0, + "step": 21846 + }, + { + "epoch": 2.7791629563668745, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.920169830322266, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8614641427993774, + "num_tokens": 833660466.0, + "step": 21847 + }, + { + "epoch": 2.779290166645465, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.869855880737305, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8641873598098755, + "num_tokens": 833697465.0, + "step": 21848 + }, + { + "epoch": 2.7794173769240555, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.577800750732422, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8713979721069336, + "num_tokens": 833735178.0, + "step": 21849 + }, + { + "epoch": 2.779544587202646, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.604862213134766, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8911833763122559, + "num_tokens": 833770193.0, + "step": 21850 + }, + { + "epoch": 2.7796717974812366, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.641319274902344, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8739441633224487, + "num_tokens": 833807836.0, + "step": 21851 + }, + { + "epoch": 2.779799007759827, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.472871780395508, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8704850673675537, + "num_tokens": 833847605.0, + "step": 21852 + }, + { + "epoch": 2.7799262180384177, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.622255325317383, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8768945336341858, + "num_tokens": 833884239.0, + "step": 21853 + }, + { + "epoch": 2.780053428317008, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74369239807129, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.874512791633606, + "num_tokens": 833921305.0, + "step": 21854 + }, + { + "epoch": 2.7801806385955983, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.607406616210938, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8661435842514038, + "num_tokens": 833959297.0, + "step": 21855 + }, + { + "epoch": 2.7803078488741892, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02166748046875, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8508797883987427, + "num_tokens": 833992135.0, + "step": 21856 + }, + { + "epoch": 2.7804350591527793, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.621501922607422, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8636301159858704, + "num_tokens": 834035933.0, + "step": 21857 + }, + { + "epoch": 2.7805622694313703, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.763015747070312, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8735517263412476, + "num_tokens": 834070187.0, + "step": 21858 + }, + { + "epoch": 2.7806894797099604, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.551664352416992, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8741362690925598, + "num_tokens": 834107370.0, + "step": 21859 + }, + { + "epoch": 2.780816689988551, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60701560974121, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8656901121139526, + "num_tokens": 834145357.0, + "step": 21860 + }, + { + "epoch": 2.7809439002671414, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.624683380126953, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.871615469455719, + "num_tokens": 834187250.0, + "step": 21861 + }, + { + "epoch": 2.781071110545732, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.49983024597168, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.884143590927124, + "num_tokens": 834227598.0, + "step": 21862 + }, + { + "epoch": 2.7811983208243225, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72798728942871, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.887859046459198, + "num_tokens": 834267032.0, + "step": 21863 + }, + { + "epoch": 2.781325531102913, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.580564498901367, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8626569509506226, + "num_tokens": 834308479.0, + "step": 21864 + }, + { + "epoch": 2.7814527413815036, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56148338317871, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8829762935638428, + "num_tokens": 834346185.0, + "step": 21865 + }, + { + "epoch": 2.781579951660094, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.779720306396484, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8820945024490356, + "num_tokens": 834384404.0, + "step": 21866 + }, + { + "epoch": 2.7817071619386846, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.817609786987305, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8724417686462402, + "num_tokens": 834424833.0, + "step": 21867 + }, + { + "epoch": 2.781834372217275, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.480064392089844, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8751506209373474, + "num_tokens": 834464215.0, + "step": 21868 + }, + { + "epoch": 2.7819615824958657, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80538558959961, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8506062030792236, + "num_tokens": 834502452.0, + "step": 21869 + }, + { + "epoch": 2.782088792774456, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7214412689209, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8737881183624268, + "num_tokens": 834539690.0, + "step": 21870 + }, + { + "epoch": 2.7822160030530467, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.740373611450195, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8632221221923828, + "num_tokens": 834583193.0, + "step": 21871 + }, + { + "epoch": 2.7823432133316373, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.848814010620117, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8666830062866211, + "num_tokens": 834620028.0, + "step": 21872 + }, + { + "epoch": 2.782470423610228, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60527992248535, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8635967373847961, + "num_tokens": 834657351.0, + "step": 21873 + }, + { + "epoch": 2.7825976338888183, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.483842849731445, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8872317671775818, + "num_tokens": 834693687.0, + "step": 21874 + }, + { + "epoch": 2.782724844167409, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.76043128967285, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8723641633987427, + "num_tokens": 834736171.0, + "step": 21875 + }, + { + "epoch": 2.7828520544459994, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.626110076904297, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8869336843490601, + "num_tokens": 834775699.0, + "step": 21876 + }, + { + "epoch": 2.78297926472459, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42392921447754, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8674139380455017, + "num_tokens": 834816053.0, + "step": 21877 + }, + { + "epoch": 2.78310647500318, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.709903717041016, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8687325716018677, + "num_tokens": 834857125.0, + "step": 21878 + }, + { + "epoch": 2.783233685281771, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.468299865722656, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.886292576789856, + "num_tokens": 834895676.0, + "step": 21879 + }, + { + "epoch": 2.783360895560361, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.652626037597656, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8769040703773499, + "num_tokens": 834927748.0, + "step": 21880 + }, + { + "epoch": 2.783488105838952, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.567583084106445, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8694672584533691, + "num_tokens": 834966722.0, + "step": 21881 + }, + { + "epoch": 2.783615316117542, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.739290237426758, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8749186992645264, + "num_tokens": 835003770.0, + "step": 21882 + }, + { + "epoch": 2.783742526396133, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.672975540161133, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8820201754570007, + "num_tokens": 835041504.0, + "step": 21883 + }, + { + "epoch": 2.783869736674723, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.585315704345703, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8757273554801941, + "num_tokens": 835075179.0, + "step": 21884 + }, + { + "epoch": 2.7839969469533137, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70656967163086, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8633190393447876, + "num_tokens": 835113080.0, + "step": 21885 + }, + { + "epoch": 2.784124157231904, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.537445068359375, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.872734010219574, + "num_tokens": 835150586.0, + "step": 21886 + }, + { + "epoch": 2.7842513675104947, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.503326416015625, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8714594841003418, + "num_tokens": 835184049.0, + "step": 21887 + }, + { + "epoch": 2.7843785777890853, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80987548828125, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8681840896606445, + "num_tokens": 835222745.0, + "step": 21888 + }, + { + "epoch": 2.784505788067676, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.745769500732422, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8615289330482483, + "num_tokens": 835263017.0, + "step": 21889 + }, + { + "epoch": 2.7846329983462663, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.472537994384766, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8869167566299438, + "num_tokens": 835303916.0, + "step": 21890 + }, + { + "epoch": 2.784760208624857, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.635318756103516, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8888229727745056, + "num_tokens": 835343148.0, + "step": 21891 + }, + { + "epoch": 2.7848874189034474, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.582965850830078, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8703489303588867, + "num_tokens": 835382337.0, + "step": 21892 + }, + { + "epoch": 2.785014629182038, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.622379302978516, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8669183254241943, + "num_tokens": 835423645.0, + "step": 21893 + }, + { + "epoch": 2.7851418394606284, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97147560119629, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8470970392227173, + "num_tokens": 835464590.0, + "step": 21894 + }, + { + "epoch": 2.785269049739219, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.896289825439453, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8955724835395813, + "num_tokens": 835496376.0, + "step": 21895 + }, + { + "epoch": 2.7853962600178095, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.52560806274414, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.858884871006012, + "num_tokens": 835541231.0, + "step": 21896 + }, + { + "epoch": 2.7855234702964, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.743898391723633, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8771008253097534, + "num_tokens": 835584038.0, + "step": 21897 + }, + { + "epoch": 2.7856506805749905, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71288299560547, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8671458959579468, + "num_tokens": 835627189.0, + "step": 21898 + }, + { + "epoch": 2.785777890853581, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.508522033691406, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8742530345916748, + "num_tokens": 835659701.0, + "step": 21899 + }, + { + "epoch": 2.7859051011321716, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15131378173828, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8798414468765259, + "num_tokens": 835694045.0, + "step": 21900 + }, + { + "epoch": 2.786032311410762, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.846080780029297, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8711518049240112, + "num_tokens": 835732118.0, + "step": 21901 + }, + { + "epoch": 2.7861595216893527, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54859161376953, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8826138973236084, + "num_tokens": 835772700.0, + "step": 21902 + }, + { + "epoch": 2.7862867319679427, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.924509048461914, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8577316403388977, + "num_tokens": 835811206.0, + "step": 21903 + }, + { + "epoch": 2.7864139422465337, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59222412109375, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.879116415977478, + "num_tokens": 835844381.0, + "step": 21904 + }, + { + "epoch": 2.786541152525124, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.639509201049805, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8657404780387878, + "num_tokens": 835879926.0, + "step": 21905 + }, + { + "epoch": 2.7866683628037148, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.974390029907227, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8627662658691406, + "num_tokens": 835924014.0, + "step": 21906 + }, + { + "epoch": 2.786795573082305, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.582117080688477, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.866901159286499, + "num_tokens": 835956054.0, + "step": 21907 + }, + { + "epoch": 2.786922783360896, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83418846130371, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.879433274269104, + "num_tokens": 835988885.0, + "step": 21908 + }, + { + "epoch": 2.787049993639486, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65486717224121, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8727602362632751, + "num_tokens": 836028885.0, + "step": 21909 + }, + { + "epoch": 2.7871772039180764, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.560840606689453, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8863275051116943, + "num_tokens": 836067340.0, + "step": 21910 + }, + { + "epoch": 2.787304414196667, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.38004493713379, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8736743927001953, + "num_tokens": 836104432.0, + "step": 21911 + }, + { + "epoch": 2.7874316244752575, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.868961334228516, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8687936067581177, + "num_tokens": 836141197.0, + "step": 21912 + }, + { + "epoch": 2.787558834753848, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8966121673584, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8849940299987793, + "num_tokens": 836176949.0, + "step": 21913 + }, + { + "epoch": 2.7876860450324386, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51766014099121, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.870448112487793, + "num_tokens": 836206484.0, + "step": 21914 + }, + { + "epoch": 2.787813255311029, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.596351623535156, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8785861730575562, + "num_tokens": 836243056.0, + "step": 21915 + }, + { + "epoch": 2.7879404655896196, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.796056747436523, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.868500828742981, + "num_tokens": 836281154.0, + "step": 21916 + }, + { + "epoch": 2.78806767586821, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70359992980957, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.87270188331604, + "num_tokens": 836319895.0, + "step": 21917 + }, + { + "epoch": 2.7881948861468007, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.565031051635742, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8779816627502441, + "num_tokens": 836357227.0, + "step": 21918 + }, + { + "epoch": 2.788322096425391, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.891260147094727, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8664509057998657, + "num_tokens": 836398442.0, + "step": 21919 + }, + { + "epoch": 2.7884493067039817, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.957630157470703, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8816656470298767, + "num_tokens": 836436660.0, + "step": 21920 + }, + { + "epoch": 2.7885765169825723, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.902454376220703, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8673515915870667, + "num_tokens": 836479180.0, + "step": 21921 + }, + { + "epoch": 2.788703727261163, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8374080657959, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8795362710952759, + "num_tokens": 836518874.0, + "step": 21922 + }, + { + "epoch": 2.7888309375397533, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.662538528442383, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8691607713699341, + "num_tokens": 836556910.0, + "step": 21923 + }, + { + "epoch": 2.788958147818344, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.610197067260742, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.885393500328064, + "num_tokens": 836598151.0, + "step": 21924 + }, + { + "epoch": 2.7890853580969344, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66642189025879, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8658249974250793, + "num_tokens": 836637952.0, + "step": 21925 + }, + { + "epoch": 2.789212568375525, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.797422409057617, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8865115642547607, + "num_tokens": 836677138.0, + "step": 21926 + }, + { + "epoch": 2.7893397786541154, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96868896484375, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8783305287361145, + "num_tokens": 836716880.0, + "step": 21927 + }, + { + "epoch": 2.7894669889327055, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.593446731567383, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8858940601348877, + "num_tokens": 836752572.0, + "step": 21928 + }, + { + "epoch": 2.7895941992112965, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.759401321411133, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8706942200660706, + "num_tokens": 836794321.0, + "step": 21929 + }, + { + "epoch": 2.7897214094898866, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65475845336914, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8870601654052734, + "num_tokens": 836829183.0, + "step": 21930 + }, + { + "epoch": 2.7898486197684775, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.76507568359375, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8625234365463257, + "num_tokens": 836864644.0, + "step": 21931 + }, + { + "epoch": 2.7899758300470676, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.521400451660156, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8855961561203003, + "num_tokens": 836895384.0, + "step": 21932 + }, + { + "epoch": 2.7901030403256586, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.652610778808594, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8762491941452026, + "num_tokens": 836943336.0, + "step": 21933 + }, + { + "epoch": 2.7902302506042487, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84852409362793, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8879842162132263, + "num_tokens": 836980708.0, + "step": 21934 + }, + { + "epoch": 2.790357460882839, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.538652420043945, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8770751357078552, + "num_tokens": 837015738.0, + "step": 21935 + }, + { + "epoch": 2.7904846711614297, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.920459747314453, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8800827264785767, + "num_tokens": 837053014.0, + "step": 21936 + }, + { + "epoch": 2.7906118814400203, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.455907821655273, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8682361245155334, + "num_tokens": 837083579.0, + "step": 21937 + }, + { + "epoch": 2.790739091718611, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.636335372924805, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8825826644897461, + "num_tokens": 837126201.0, + "step": 21938 + }, + { + "epoch": 2.7908663019972013, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8669376373291, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8830205202102661, + "num_tokens": 837162400.0, + "step": 21939 + }, + { + "epoch": 2.790993512275792, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59841537475586, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8774278163909912, + "num_tokens": 837205733.0, + "step": 21940 + }, + { + "epoch": 2.7911207225543824, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.678691864013672, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.864308774471283, + "num_tokens": 837243838.0, + "step": 21941 + }, + { + "epoch": 2.791247932832973, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.648193359375, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8697109222412109, + "num_tokens": 837280936.0, + "step": 21942 + }, + { + "epoch": 2.7913751431115634, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64530372619629, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8743786811828613, + "num_tokens": 837321790.0, + "step": 21943 + }, + { + "epoch": 2.791502353390154, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77820587158203, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8762108087539673, + "num_tokens": 837362023.0, + "step": 21944 + }, + { + "epoch": 2.7916295636687445, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53700065612793, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8808488845825195, + "num_tokens": 837404876.0, + "step": 21945 + }, + { + "epoch": 2.791756773947335, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.834104537963867, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8676459193229675, + "num_tokens": 837442899.0, + "step": 21946 + }, + { + "epoch": 2.7918839842259255, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73066520690918, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8833299875259399, + "num_tokens": 837477611.0, + "step": 21947 + }, + { + "epoch": 2.792011194504516, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57777214050293, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8619333505630493, + "num_tokens": 837515928.0, + "step": 21948 + }, + { + "epoch": 2.7921384047831066, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.724706649780273, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8751273155212402, + "num_tokens": 837553042.0, + "step": 21949 + }, + { + "epoch": 2.792265615061697, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.778060913085938, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8797974586486816, + "num_tokens": 837589749.0, + "step": 21950 + }, + { + "epoch": 2.7923928253402877, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.518613815307617, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8918188810348511, + "num_tokens": 837631104.0, + "step": 21951 + }, + { + "epoch": 2.792520035618878, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.867053985595703, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8834744095802307, + "num_tokens": 837668173.0, + "step": 21952 + }, + { + "epoch": 2.7926472458974683, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.625341415405273, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8806467056274414, + "num_tokens": 837704282.0, + "step": 21953 + }, + { + "epoch": 2.7927744561760592, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.730207443237305, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8650068044662476, + "num_tokens": 837750790.0, + "step": 21954 + }, + { + "epoch": 2.7929016664546493, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72107696533203, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8715665936470032, + "num_tokens": 837792041.0, + "step": 21955 + }, + { + "epoch": 2.7930288767332403, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51519012451172, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8801140785217285, + "num_tokens": 837818645.0, + "step": 21956 + }, + { + "epoch": 2.7931560870118304, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58585548400879, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8836957812309265, + "num_tokens": 837859816.0, + "step": 21957 + }, + { + "epoch": 2.793283297290421, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.893905639648438, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8693797588348389, + "num_tokens": 837903970.0, + "step": 21958 + }, + { + "epoch": 2.7934105075690114, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.730009078979492, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8748779892921448, + "num_tokens": 837941158.0, + "step": 21959 + }, + { + "epoch": 2.793537717847602, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.44798469543457, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8808020949363708, + "num_tokens": 837978157.0, + "step": 21960 + }, + { + "epoch": 2.7936649281261925, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.809236526489258, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8850361108779907, + "num_tokens": 838017871.0, + "step": 21961 + }, + { + "epoch": 2.793792138404783, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59074592590332, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8853480815887451, + "num_tokens": 838050521.0, + "step": 21962 + }, + { + "epoch": 2.7939193486833735, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.78117561340332, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8826352953910828, + "num_tokens": 838084450.0, + "step": 21963 + }, + { + "epoch": 2.794046558961964, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.801982879638672, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8681246042251587, + "num_tokens": 838121505.0, + "step": 21964 + }, + { + "epoch": 2.7941737692405546, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.595945358276367, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8721683025360107, + "num_tokens": 838166148.0, + "step": 21965 + }, + { + "epoch": 2.794300979519145, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63620376586914, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8805273175239563, + "num_tokens": 838203578.0, + "step": 21966 + }, + { + "epoch": 2.7944281897977357, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.827043533325195, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8801789879798889, + "num_tokens": 838240702.0, + "step": 21967 + }, + { + "epoch": 2.794555400076326, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.55576515197754, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8818542957305908, + "num_tokens": 838275639.0, + "step": 21968 + }, + { + "epoch": 2.7946826103549167, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77699851989746, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8753892183303833, + "num_tokens": 838310045.0, + "step": 21969 + }, + { + "epoch": 2.7948098206335072, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42380142211914, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8751017451286316, + "num_tokens": 838351571.0, + "step": 21970 + }, + { + "epoch": 2.7949370309120978, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.740705490112305, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8868297338485718, + "num_tokens": 838393804.0, + "step": 21971 + }, + { + "epoch": 2.7950642411906883, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.908100128173828, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8743633031845093, + "num_tokens": 838435516.0, + "step": 21972 + }, + { + "epoch": 2.795191451469279, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.532649993896484, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8824726343154907, + "num_tokens": 838465425.0, + "step": 21973 + }, + { + "epoch": 2.7953186617478694, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.707359313964844, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8721808791160583, + "num_tokens": 838512832.0, + "step": 21974 + }, + { + "epoch": 2.79544587202646, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70688247680664, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8807957172393799, + "num_tokens": 838552806.0, + "step": 21975 + }, + { + "epoch": 2.79557308230505, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.602079391479492, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.873556911945343, + "num_tokens": 838588592.0, + "step": 21976 + }, + { + "epoch": 2.795700292583641, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.910722732543945, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.872486412525177, + "num_tokens": 838629402.0, + "step": 21977 + }, + { + "epoch": 2.795827502862231, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.714120864868164, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8655067086219788, + "num_tokens": 838676580.0, + "step": 21978 + }, + { + "epoch": 2.795954713140822, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.541301727294922, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8691506385803223, + "num_tokens": 838715773.0, + "step": 21979 + }, + { + "epoch": 2.796081923419412, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60785675048828, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8636152744293213, + "num_tokens": 838750270.0, + "step": 21980 + }, + { + "epoch": 2.796209133698003, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.640811920166016, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8534442186355591, + "num_tokens": 838794823.0, + "step": 21981 + }, + { + "epoch": 2.796336343976593, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.47022247314453, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8656195402145386, + "num_tokens": 838828901.0, + "step": 21982 + }, + { + "epoch": 2.7964635542551837, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.599618911743164, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8655836582183838, + "num_tokens": 838872648.0, + "step": 21983 + }, + { + "epoch": 2.796590764533774, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.976703643798828, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8876702785491943, + "num_tokens": 838905150.0, + "step": 21984 + }, + { + "epoch": 2.7967179748123647, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.547090530395508, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8742281198501587, + "num_tokens": 838949027.0, + "step": 21985 + }, + { + "epoch": 2.7968451850909553, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.605777740478516, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8760216236114502, + "num_tokens": 838987320.0, + "step": 21986 + }, + { + "epoch": 2.796972395369546, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.699533462524414, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8699977993965149, + "num_tokens": 839027396.0, + "step": 21987 + }, + { + "epoch": 2.7970996056481363, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.727306365966797, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8801854848861694, + "num_tokens": 839069779.0, + "step": 21988 + }, + { + "epoch": 2.797226815926727, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56586456298828, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8809068202972412, + "num_tokens": 839110919.0, + "step": 21989 + }, + { + "epoch": 2.7973540262053174, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80453872680664, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8883867263793945, + "num_tokens": 839143689.0, + "step": 21990 + }, + { + "epoch": 2.797481236483908, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.876527786254883, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8567148447036743, + "num_tokens": 839181274.0, + "step": 21991 + }, + { + "epoch": 2.7976084467624984, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65023422241211, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8581738471984863, + "num_tokens": 839217038.0, + "step": 21992 + }, + { + "epoch": 2.797735657041089, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.796844482421875, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8743611574172974, + "num_tokens": 839260104.0, + "step": 21993 + }, + { + "epoch": 2.7978628673196795, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.162391662597656, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8774105906486511, + "num_tokens": 839294328.0, + "step": 21994 + }, + { + "epoch": 2.79799007759827, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.53656768798828, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8728591203689575, + "num_tokens": 839326662.0, + "step": 21995 + }, + { + "epoch": 2.7981172878768605, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.669431686401367, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8757702112197876, + "num_tokens": 839355997.0, + "step": 21996 + }, + { + "epoch": 2.798244498155451, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.922225952148438, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8796523809432983, + "num_tokens": 839396596.0, + "step": 21997 + }, + { + "epoch": 2.7983717084340416, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61939811706543, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.875823974609375, + "num_tokens": 839432251.0, + "step": 21998 + }, + { + "epoch": 2.798498918712632, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.798498153686523, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8830807209014893, + "num_tokens": 839465987.0, + "step": 21999 + }, + { + "epoch": 2.7986261289912227, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.666793823242188, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8770334124565125, + "num_tokens": 839506147.0, + "step": 22000 + }, + { + "epoch": 2.7987533392698127, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.704925537109375, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8662716150283813, + "num_tokens": 839544620.0, + "step": 22001 + }, + { + "epoch": 2.7988805495484037, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.578697204589844, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.866547703742981, + "num_tokens": 839583217.0, + "step": 22002 + }, + { + "epoch": 2.799007759826994, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.770263671875, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8925364017486572, + "num_tokens": 839621283.0, + "step": 22003 + }, + { + "epoch": 2.7991349701055848, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.632139205932617, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8697338104248047, + "num_tokens": 839658072.0, + "step": 22004 + }, + { + "epoch": 2.799262180384175, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89253807067871, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8567419052124023, + "num_tokens": 839700926.0, + "step": 22005 + }, + { + "epoch": 2.799389390662766, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.665294647216797, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8835389614105225, + "num_tokens": 839736160.0, + "step": 22006 + }, + { + "epoch": 2.799516600941356, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.65203094482422, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8887569904327393, + "num_tokens": 839776834.0, + "step": 22007 + }, + { + "epoch": 2.7996438112199464, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81586456298828, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8667443990707397, + "num_tokens": 839818156.0, + "step": 22008 + }, + { + "epoch": 2.799771021498537, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.54852294921875, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8682043552398682, + "num_tokens": 839857133.0, + "step": 22009 + }, + { + "epoch": 2.7998982317771275, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69886589050293, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8585673570632935, + "num_tokens": 839893129.0, + "step": 22010 + }, + { + "epoch": 2.800025442055718, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.516525268554688, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8772133588790894, + "num_tokens": 839924622.0, + "step": 22011 + }, + { + "epoch": 2.8001526523343085, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71346092224121, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8933520317077637, + "num_tokens": 839961276.0, + "step": 22012 + }, + { + "epoch": 2.800279862612899, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.617042541503906, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8634081482887268, + "num_tokens": 840002554.0, + "step": 22013 + }, + { + "epoch": 2.8004070728914896, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.62798500061035, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8727364540100098, + "num_tokens": 840041642.0, + "step": 22014 + }, + { + "epoch": 2.80053428317008, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.627925872802734, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8831534385681152, + "num_tokens": 840083532.0, + "step": 22015 + }, + { + "epoch": 2.8006614934486707, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.612850189208984, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8710283041000366, + "num_tokens": 840124221.0, + "step": 22016 + }, + { + "epoch": 2.800788703727261, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.755823135375977, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8788636922836304, + "num_tokens": 840162309.0, + "step": 22017 + }, + { + "epoch": 2.8009159140058517, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7532958984375, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8756759166717529, + "num_tokens": 840194778.0, + "step": 22018 + }, + { + "epoch": 2.8010431242844422, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77121353149414, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8923592567443848, + "num_tokens": 840225081.0, + "step": 22019 + }, + { + "epoch": 2.8011703345630328, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.776256561279297, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8728856444358826, + "num_tokens": 840267698.0, + "step": 22020 + }, + { + "epoch": 2.8012975448416233, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.567707061767578, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8682311773300171, + "num_tokens": 840303255.0, + "step": 22021 + }, + { + "epoch": 2.801424755120214, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.727460861206055, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8733909130096436, + "num_tokens": 840344616.0, + "step": 22022 + }, + { + "epoch": 2.8015519653988044, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.636741638183594, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8752591609954834, + "num_tokens": 840383293.0, + "step": 22023 + }, + { + "epoch": 2.801679175677395, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.672727584838867, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8654055595397949, + "num_tokens": 840424583.0, + "step": 22024 + }, + { + "epoch": 2.8018063859559854, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.614479064941406, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8693090677261353, + "num_tokens": 840458752.0, + "step": 22025 + }, + { + "epoch": 2.8019335962345755, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.601062774658203, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8804709911346436, + "num_tokens": 840498441.0, + "step": 22026 + }, + { + "epoch": 2.8020608065131665, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.826356887817383, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8845813274383545, + "num_tokens": 840537838.0, + "step": 22027 + }, + { + "epoch": 2.8021880167917566, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.654054641723633, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8631491661071777, + "num_tokens": 840571770.0, + "step": 22028 + }, + { + "epoch": 2.8023152270703475, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.878528594970703, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8692048788070679, + "num_tokens": 840607559.0, + "step": 22029 + }, + { + "epoch": 2.8024424373489376, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.652193069458008, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8596105575561523, + "num_tokens": 840650156.0, + "step": 22030 + }, + { + "epoch": 2.8025696476275286, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.507692337036133, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8658979535102844, + "num_tokens": 840686590.0, + "step": 22031 + }, + { + "epoch": 2.8026968579061187, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.862953186035156, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8667594194412231, + "num_tokens": 840724445.0, + "step": 22032 + }, + { + "epoch": 2.802824068184709, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.448007583618164, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.888754665851593, + "num_tokens": 840765047.0, + "step": 22033 + }, + { + "epoch": 2.8029512784632997, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.932180404663086, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8615325689315796, + "num_tokens": 840802375.0, + "step": 22034 + }, + { + "epoch": 2.8030784887418903, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.635631561279297, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8731484413146973, + "num_tokens": 840839994.0, + "step": 22035 + }, + { + "epoch": 2.803205699020481, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.731029510498047, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8723477721214294, + "num_tokens": 840877144.0, + "step": 22036 + }, + { + "epoch": 2.8033329092990713, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8748779296875, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8647283911705017, + "num_tokens": 840918478.0, + "step": 22037 + }, + { + "epoch": 2.803460119577662, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.608150482177734, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8829936981201172, + "num_tokens": 840958484.0, + "step": 22038 + }, + { + "epoch": 2.8035873298562524, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72323989868164, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8763630390167236, + "num_tokens": 841000035.0, + "step": 22039 + }, + { + "epoch": 2.803714540134843, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.754446029663086, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8691249489784241, + "num_tokens": 841035836.0, + "step": 22040 + }, + { + "epoch": 2.8038417504134334, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57999038696289, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8756594657897949, + "num_tokens": 841069295.0, + "step": 22041 + }, + { + "epoch": 2.803968960692024, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.947107315063477, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8692629337310791, + "num_tokens": 841108341.0, + "step": 22042 + }, + { + "epoch": 2.8040961709706145, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.739501953125, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8840680122375488, + "num_tokens": 841142945.0, + "step": 22043 + }, + { + "epoch": 2.804223381249205, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7309627532959, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8869563341140747, + "num_tokens": 841174137.0, + "step": 22044 + }, + { + "epoch": 2.8043505915277955, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.674440383911133, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8840391635894775, + "num_tokens": 841215914.0, + "step": 22045 + }, + { + "epoch": 2.804477801806386, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.726890563964844, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8757390975952148, + "num_tokens": 841256097.0, + "step": 22046 + }, + { + "epoch": 2.8046050120849766, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.800796508789062, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8785299062728882, + "num_tokens": 841288997.0, + "step": 22047 + }, + { + "epoch": 2.804732222363567, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.870241165161133, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8697679042816162, + "num_tokens": 841328053.0, + "step": 22048 + }, + { + "epoch": 2.8048594326421576, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.619016647338867, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8924407958984375, + "num_tokens": 841366848.0, + "step": 22049 + }, + { + "epoch": 2.804986642920748, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.657909393310547, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8828446865081787, + "num_tokens": 841398568.0, + "step": 22050 + }, + { + "epoch": 2.8051138531993383, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51123809814453, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8718545436859131, + "num_tokens": 841435722.0, + "step": 22051 + }, + { + "epoch": 2.8052410634779292, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75144386291504, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8699181079864502, + "num_tokens": 841473635.0, + "step": 22052 + }, + { + "epoch": 2.8053682737565193, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.613496780395508, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8773784637451172, + "num_tokens": 841512383.0, + "step": 22053 + }, + { + "epoch": 2.8054954840351103, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.611019134521484, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8688495755195618, + "num_tokens": 841543174.0, + "step": 22054 + }, + { + "epoch": 2.8056226943137004, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.736385345458984, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8796181678771973, + "num_tokens": 841582185.0, + "step": 22055 + }, + { + "epoch": 2.805749904592291, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4179744720459, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8696177005767822, + "num_tokens": 841623179.0, + "step": 22056 + }, + { + "epoch": 2.8058771148708814, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.4700927734375, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8822221159934998, + "num_tokens": 841661062.0, + "step": 22057 + }, + { + "epoch": 2.806004325149472, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.575111389160156, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8719595670700073, + "num_tokens": 841704259.0, + "step": 22058 + }, + { + "epoch": 2.8061315354280625, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56993293762207, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.872506320476532, + "num_tokens": 841748396.0, + "step": 22059 + }, + { + "epoch": 2.806258745706653, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.753265380859375, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8925118446350098, + "num_tokens": 841787809.0, + "step": 22060 + }, + { + "epoch": 2.8063859559852435, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.772655487060547, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8739395141601562, + "num_tokens": 841824046.0, + "step": 22061 + }, + { + "epoch": 2.806513166263834, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.702106475830078, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.882948637008667, + "num_tokens": 841856949.0, + "step": 22062 + }, + { + "epoch": 2.8066403765424246, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.758129119873047, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8615790605545044, + "num_tokens": 841898644.0, + "step": 22063 + }, + { + "epoch": 2.806767586821015, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59722328186035, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.877227783203125, + "num_tokens": 841934795.0, + "step": 22064 + }, + { + "epoch": 2.8068947970996057, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73431968688965, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8594245910644531, + "num_tokens": 841976878.0, + "step": 22065 + }, + { + "epoch": 2.807022007378196, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.774633407592773, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8743990659713745, + "num_tokens": 842010998.0, + "step": 22066 + }, + { + "epoch": 2.8071492176567867, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73653793334961, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8989081382751465, + "num_tokens": 842051410.0, + "step": 22067 + }, + { + "epoch": 2.8072764279353772, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56066131591797, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8764042854309082, + "num_tokens": 842087954.0, + "step": 22068 + }, + { + "epoch": 2.8074036382139678, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72269058227539, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8711112141609192, + "num_tokens": 842128326.0, + "step": 22069 + }, + { + "epoch": 2.8075308484925583, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.506500244140625, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8541002869606018, + "num_tokens": 842171143.0, + "step": 22070 + }, + { + "epoch": 2.807658058771149, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.055017471313477, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8623488545417786, + "num_tokens": 842212167.0, + "step": 22071 + }, + { + "epoch": 2.8077852690497394, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.691848754882812, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8645374774932861, + "num_tokens": 842252576.0, + "step": 22072 + }, + { + "epoch": 2.80791247932833, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.952463150024414, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8710785508155823, + "num_tokens": 842287812.0, + "step": 22073 + }, + { + "epoch": 2.80803968960692, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69962501525879, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8709231615066528, + "num_tokens": 842328289.0, + "step": 22074 + }, + { + "epoch": 2.808166899885511, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.854143142700195, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8775978088378906, + "num_tokens": 842359182.0, + "step": 22075 + }, + { + "epoch": 2.808294110164101, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.901565551757812, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8648586273193359, + "num_tokens": 842399820.0, + "step": 22076 + }, + { + "epoch": 2.808421320442692, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.657073974609375, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8726490139961243, + "num_tokens": 842437852.0, + "step": 22077 + }, + { + "epoch": 2.808548530721282, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.713151931762695, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8603770732879639, + "num_tokens": 842476729.0, + "step": 22078 + }, + { + "epoch": 2.808675740999873, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.673423767089844, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8620272874832153, + "num_tokens": 842514590.0, + "step": 22079 + }, + { + "epoch": 2.808802951278463, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.92438507080078, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8727383613586426, + "num_tokens": 842556502.0, + "step": 22080 + }, + { + "epoch": 2.8089301615570537, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.68939781188965, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8627147674560547, + "num_tokens": 842594299.0, + "step": 22081 + }, + { + "epoch": 2.809057371835644, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.688133239746094, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8714179992675781, + "num_tokens": 842633189.0, + "step": 22082 + }, + { + "epoch": 2.8091845821142347, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77213478088379, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8759452104568481, + "num_tokens": 842673626.0, + "step": 22083 + }, + { + "epoch": 2.8093117923928252, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.547510147094727, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8674921989440918, + "num_tokens": 842712574.0, + "step": 22084 + }, + { + "epoch": 2.8094390026714158, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.762800216674805, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8715770244598389, + "num_tokens": 842749534.0, + "step": 22085 + }, + { + "epoch": 2.8095662129500063, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.565282821655273, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8650399446487427, + "num_tokens": 842784989.0, + "step": 22086 + }, + { + "epoch": 2.809693423228597, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.951671600341797, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8744361996650696, + "num_tokens": 842823858.0, + "step": 22087 + }, + { + "epoch": 2.8098206335071874, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57685661315918, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8741886615753174, + "num_tokens": 842858849.0, + "step": 22088 + }, + { + "epoch": 2.809947843785778, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7772274017334, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8935927152633667, + "num_tokens": 842895809.0, + "step": 22089 + }, + { + "epoch": 2.8100750540643684, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.786983489990234, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.9015663266181946, + "num_tokens": 842933650.0, + "step": 22090 + }, + { + "epoch": 2.810202264342959, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.796768188476562, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8790720701217651, + "num_tokens": 842967200.0, + "step": 22091 + }, + { + "epoch": 2.8103294746215495, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8868465423584, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.866026759147644, + "num_tokens": 843002498.0, + "step": 22092 + }, + { + "epoch": 2.81045668490014, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.710147857666016, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8786506652832031, + "num_tokens": 843042627.0, + "step": 22093 + }, + { + "epoch": 2.8105838951787305, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.819557189941406, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8699765801429749, + "num_tokens": 843076862.0, + "step": 22094 + }, + { + "epoch": 2.810711105457321, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.886890411376953, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8742385506629944, + "num_tokens": 843115000.0, + "step": 22095 + }, + { + "epoch": 2.8108383157359116, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58926773071289, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8809319734573364, + "num_tokens": 843152408.0, + "step": 22096 + }, + { + "epoch": 2.810965526014502, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9152774810791, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.875496506690979, + "num_tokens": 843189269.0, + "step": 22097 + }, + { + "epoch": 2.8110927362930926, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.736373901367188, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.870200514793396, + "num_tokens": 843227148.0, + "step": 22098 + }, + { + "epoch": 2.8112199465716827, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.611204147338867, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.864751398563385, + "num_tokens": 843259736.0, + "step": 22099 + }, + { + "epoch": 2.8113471568502737, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.923921585083008, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8700404763221741, + "num_tokens": 843302598.0, + "step": 22100 + }, + { + "epoch": 2.811474367128864, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.726545333862305, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8895696997642517, + "num_tokens": 843337615.0, + "step": 22101 + }, + { + "epoch": 2.8116015774074548, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.918832778930664, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8901091814041138, + "num_tokens": 843378845.0, + "step": 22102 + }, + { + "epoch": 2.811728787686045, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.732019424438477, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8646286129951477, + "num_tokens": 843414128.0, + "step": 22103 + }, + { + "epoch": 2.811855997964636, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73593521118164, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8895204067230225, + "num_tokens": 843454255.0, + "step": 22104 + }, + { + "epoch": 2.811983208243226, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88302230834961, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8758299350738525, + "num_tokens": 843498332.0, + "step": 22105 + }, + { + "epoch": 2.8121104185218164, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.730731964111328, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8646471500396729, + "num_tokens": 843539196.0, + "step": 22106 + }, + { + "epoch": 2.812237628800407, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.829565048217773, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8782411813735962, + "num_tokens": 843573767.0, + "step": 22107 + }, + { + "epoch": 2.8123648390789975, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.959440231323242, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8624410033226013, + "num_tokens": 843611156.0, + "step": 22108 + }, + { + "epoch": 2.812492049357588, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.741304397583008, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8797423243522644, + "num_tokens": 843644026.0, + "step": 22109 + }, + { + "epoch": 2.8126192596361785, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.848352432250977, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8772484064102173, + "num_tokens": 843683477.0, + "step": 22110 + }, + { + "epoch": 2.812746469914769, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.959619522094727, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8613274097442627, + "num_tokens": 843722815.0, + "step": 22111 + }, + { + "epoch": 2.8128736801933596, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.721784591674805, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8753460049629211, + "num_tokens": 843763843.0, + "step": 22112 + }, + { + "epoch": 2.81300089047195, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.867963790893555, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.86680006980896, + "num_tokens": 843805327.0, + "step": 22113 + }, + { + "epoch": 2.8131281007505406, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.801177978515625, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.875771164894104, + "num_tokens": 843850132.0, + "step": 22114 + }, + { + "epoch": 2.813255311029131, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59269142150879, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8728746175765991, + "num_tokens": 843887425.0, + "step": 22115 + }, + { + "epoch": 2.8133825213077217, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88410758972168, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8836109638214111, + "num_tokens": 843920728.0, + "step": 22116 + }, + { + "epoch": 2.8135097315863122, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75826072692871, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8737103939056396, + "num_tokens": 843959733.0, + "step": 22117 + }, + { + "epoch": 2.8136369418649028, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.851518630981445, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8753238320350647, + "num_tokens": 844005616.0, + "step": 22118 + }, + { + "epoch": 2.8137641521434933, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81831932067871, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8891069889068604, + "num_tokens": 844034216.0, + "step": 22119 + }, + { + "epoch": 2.813891362422084, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.836416244506836, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.870611846446991, + "num_tokens": 844071173.0, + "step": 22120 + }, + { + "epoch": 2.8140185727006743, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.706417083740234, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8724552392959595, + "num_tokens": 844113665.0, + "step": 22121 + }, + { + "epoch": 2.814145782979265, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.950315475463867, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.874671995639801, + "num_tokens": 844154063.0, + "step": 22122 + }, + { + "epoch": 2.8142729932578554, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.845474243164062, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8929388523101807, + "num_tokens": 844185634.0, + "step": 22123 + }, + { + "epoch": 2.8144002035364455, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.975257873535156, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8640254139900208, + "num_tokens": 844221755.0, + "step": 22124 + }, + { + "epoch": 2.8145274138150365, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.884647369384766, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8795558214187622, + "num_tokens": 844256230.0, + "step": 22125 + }, + { + "epoch": 2.8146546240936265, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57990074157715, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8674894571304321, + "num_tokens": 844292823.0, + "step": 22126 + }, + { + "epoch": 2.8147818343722175, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.797588348388672, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8522890210151672, + "num_tokens": 844331843.0, + "step": 22127 + }, + { + "epoch": 2.8149090446508076, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15406608581543, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8846639394760132, + "num_tokens": 844375823.0, + "step": 22128 + }, + { + "epoch": 2.815036254929398, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.719152450561523, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8746049404144287, + "num_tokens": 844415187.0, + "step": 22129 + }, + { + "epoch": 2.8151634652079887, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.904361724853516, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8774449825286865, + "num_tokens": 844454986.0, + "step": 22130 + }, + { + "epoch": 2.815290675486579, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.663442611694336, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8806278705596924, + "num_tokens": 844495514.0, + "step": 22131 + }, + { + "epoch": 2.8154178857651697, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93235969543457, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8735562562942505, + "num_tokens": 844532406.0, + "step": 22132 + }, + { + "epoch": 2.8155450960437602, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88188934326172, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8738117218017578, + "num_tokens": 844569561.0, + "step": 22133 + }, + { + "epoch": 2.8156723063223508, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.590164184570312, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8793765306472778, + "num_tokens": 844613504.0, + "step": 22134 + }, + { + "epoch": 2.8157995166009413, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.567167282104492, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8719263076782227, + "num_tokens": 844649255.0, + "step": 22135 + }, + { + "epoch": 2.815926726879532, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.703323364257812, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8770585060119629, + "num_tokens": 844687202.0, + "step": 22136 + }, + { + "epoch": 2.8160539371581224, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81917953491211, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8756917119026184, + "num_tokens": 844723096.0, + "step": 22137 + }, + { + "epoch": 2.816181147436713, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.720144271850586, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8753061890602112, + "num_tokens": 844759754.0, + "step": 22138 + }, + { + "epoch": 2.8163083577153034, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.791893005371094, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8774733543395996, + "num_tokens": 844798372.0, + "step": 22139 + }, + { + "epoch": 2.816435567993894, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.866626739501953, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8693327307701111, + "num_tokens": 844832738.0, + "step": 22140 + }, + { + "epoch": 2.8165627782724845, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.509925842285156, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8754416704177856, + "num_tokens": 844871684.0, + "step": 22141 + }, + { + "epoch": 2.816689988551075, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91545295715332, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8675650358200073, + "num_tokens": 844912140.0, + "step": 22142 + }, + { + "epoch": 2.8168171988296655, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.862375259399414, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8632237911224365, + "num_tokens": 844948256.0, + "step": 22143 + }, + { + "epoch": 2.816944409108256, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91114616394043, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8899344801902771, + "num_tokens": 844983821.0, + "step": 22144 + }, + { + "epoch": 2.8170716193868466, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.565515518188477, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8786836862564087, + "num_tokens": 845020622.0, + "step": 22145 + }, + { + "epoch": 2.817198829665437, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.833765029907227, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8661717772483826, + "num_tokens": 845056748.0, + "step": 22146 + }, + { + "epoch": 2.8173260399440276, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.683277130126953, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.856478750705719, + "num_tokens": 845096294.0, + "step": 22147 + }, + { + "epoch": 2.817453250222618, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95404624938965, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8891156911849976, + "num_tokens": 845135132.0, + "step": 22148 + }, + { + "epoch": 2.8175804605012083, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75409507751465, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8829973936080933, + "num_tokens": 845172164.0, + "step": 22149 + }, + { + "epoch": 2.8177076707797992, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66193962097168, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8746157288551331, + "num_tokens": 845210270.0, + "step": 22150 + }, + { + "epoch": 2.8178348810583893, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.678157806396484, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.877732515335083, + "num_tokens": 845251863.0, + "step": 22151 + }, + { + "epoch": 2.8179620913369803, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86812973022461, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8785539865493774, + "num_tokens": 845290262.0, + "step": 22152 + }, + { + "epoch": 2.8180893016155704, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.734323501586914, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.88230961561203, + "num_tokens": 845326894.0, + "step": 22153 + }, + { + "epoch": 2.818216511894161, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.957347869873047, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8832268118858337, + "num_tokens": 845367545.0, + "step": 22154 + }, + { + "epoch": 2.8183437221727514, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.623735427856445, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8605594635009766, + "num_tokens": 845400920.0, + "step": 22155 + }, + { + "epoch": 2.818470932451342, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.554180145263672, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8698022961616516, + "num_tokens": 845436279.0, + "step": 22156 + }, + { + "epoch": 2.8185981427299325, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.874961853027344, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8784343004226685, + "num_tokens": 845476308.0, + "step": 22157 + }, + { + "epoch": 2.818725353008523, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.934045791625977, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.870308518409729, + "num_tokens": 845516624.0, + "step": 22158 + }, + { + "epoch": 2.8188525632871135, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.61766815185547, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8926293849945068, + "num_tokens": 845552385.0, + "step": 22159 + }, + { + "epoch": 2.818979773565704, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9405574798584, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.884325385093689, + "num_tokens": 845592600.0, + "step": 22160 + }, + { + "epoch": 2.8191069838442946, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.688337326049805, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.882813572883606, + "num_tokens": 845630028.0, + "step": 22161 + }, + { + "epoch": 2.819234194122885, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.630990982055664, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8705523014068604, + "num_tokens": 845665874.0, + "step": 22162 + }, + { + "epoch": 2.8193614044014756, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74336051940918, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8625296950340271, + "num_tokens": 845702738.0, + "step": 22163 + }, + { + "epoch": 2.819488614680066, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74349594116211, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8771169185638428, + "num_tokens": 845744785.0, + "step": 22164 + }, + { + "epoch": 2.8196158249586567, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.42387580871582, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.874915599822998, + "num_tokens": 845779196.0, + "step": 22165 + }, + { + "epoch": 2.8197430352372472, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.732608795166016, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.864682674407959, + "num_tokens": 845819016.0, + "step": 22166 + }, + { + "epoch": 2.8198702455158378, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.705963134765625, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8767691254615784, + "num_tokens": 845855014.0, + "step": 22167 + }, + { + "epoch": 2.8199974557944283, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.595943450927734, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8834697008132935, + "num_tokens": 845895491.0, + "step": 22168 + }, + { + "epoch": 2.820124666073019, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84120750427246, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8655493259429932, + "num_tokens": 845931560.0, + "step": 22169 + }, + { + "epoch": 2.8202518763516093, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.859853744506836, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8755114078521729, + "num_tokens": 845966701.0, + "step": 22170 + }, + { + "epoch": 2.8203790866302, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57851219177246, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8915842771530151, + "num_tokens": 846002327.0, + "step": 22171 + }, + { + "epoch": 2.82050629690879, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.550806045532227, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8758383393287659, + "num_tokens": 846042157.0, + "step": 22172 + }, + { + "epoch": 2.820633507187381, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.806610107421875, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8600651025772095, + "num_tokens": 846082851.0, + "step": 22173 + }, + { + "epoch": 2.820760717465971, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.607568740844727, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8733700513839722, + "num_tokens": 846116846.0, + "step": 22174 + }, + { + "epoch": 2.820887927744562, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.891284942626953, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8767374753952026, + "num_tokens": 846154373.0, + "step": 22175 + }, + { + "epoch": 2.821015138023152, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.746688842773438, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8918194770812988, + "num_tokens": 846195329.0, + "step": 22176 + }, + { + "epoch": 2.821142348301743, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95473861694336, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.886056661605835, + "num_tokens": 846233078.0, + "step": 22177 + }, + { + "epoch": 2.821269558580333, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.715133666992188, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8731849789619446, + "num_tokens": 846269030.0, + "step": 22178 + }, + { + "epoch": 2.8213967688589237, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73056983947754, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8711315393447876, + "num_tokens": 846307269.0, + "step": 22179 + }, + { + "epoch": 2.821523979137514, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71061897277832, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8649376034736633, + "num_tokens": 846339116.0, + "step": 22180 + }, + { + "epoch": 2.8216511894161047, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.903303146362305, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8659957051277161, + "num_tokens": 846384968.0, + "step": 22181 + }, + { + "epoch": 2.8217783996946952, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75237464904785, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8592804670333862, + "num_tokens": 846426235.0, + "step": 22182 + }, + { + "epoch": 2.8219056099732858, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.504009246826172, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8615179657936096, + "num_tokens": 846465715.0, + "step": 22183 + }, + { + "epoch": 2.8220328202518763, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86760139465332, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8614318370819092, + "num_tokens": 846499972.0, + "step": 22184 + }, + { + "epoch": 2.822160030530467, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.623010635375977, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8736658692359924, + "num_tokens": 846535906.0, + "step": 22185 + }, + { + "epoch": 2.8222872408090574, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70639991760254, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.870831310749054, + "num_tokens": 846572723.0, + "step": 22186 + }, + { + "epoch": 2.822414451087648, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.863021850585938, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8789777755737305, + "num_tokens": 846613501.0, + "step": 22187 + }, + { + "epoch": 2.8225416613662384, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.707244873046875, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8950955271720886, + "num_tokens": 846650001.0, + "step": 22188 + }, + { + "epoch": 2.822668871644829, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.569351196289062, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8625009655952454, + "num_tokens": 846690636.0, + "step": 22189 + }, + { + "epoch": 2.8227960819234195, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.966371536254883, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8540607690811157, + "num_tokens": 846728513.0, + "step": 22190 + }, + { + "epoch": 2.82292329220201, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64436149597168, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.861329197883606, + "num_tokens": 846771989.0, + "step": 22191 + }, + { + "epoch": 2.8230505024806005, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.890724182128906, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8910617232322693, + "num_tokens": 846808393.0, + "step": 22192 + }, + { + "epoch": 2.823177712759191, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.539854049682617, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8648083209991455, + "num_tokens": 846846928.0, + "step": 22193 + }, + { + "epoch": 2.8233049230377816, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99704360961914, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8650038242340088, + "num_tokens": 846884637.0, + "step": 22194 + }, + { + "epoch": 2.823432133316372, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.756778717041016, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.878247082233429, + "num_tokens": 846925749.0, + "step": 22195 + }, + { + "epoch": 2.8235593435949626, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.731647491455078, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8607255220413208, + "num_tokens": 846967191.0, + "step": 22196 + }, + { + "epoch": 2.8236865538735527, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.720060348510742, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8794344663619995, + "num_tokens": 847003280.0, + "step": 22197 + }, + { + "epoch": 2.8238137641521437, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98689079284668, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8622627258300781, + "num_tokens": 847042179.0, + "step": 22198 + }, + { + "epoch": 2.8239409744307338, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.809839248657227, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8706642389297485, + "num_tokens": 847083008.0, + "step": 22199 + }, + { + "epoch": 2.8240681847093247, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.842458724975586, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8544058203697205, + "num_tokens": 847124009.0, + "step": 22200 + }, + { + "epoch": 2.824195394987915, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81047821044922, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.867920994758606, + "num_tokens": 847163526.0, + "step": 22201 + }, + { + "epoch": 2.824322605266506, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.967288970947266, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8722697496414185, + "num_tokens": 847201374.0, + "step": 22202 + }, + { + "epoch": 2.824449815545096, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6732120513916, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8607436418533325, + "num_tokens": 847239469.0, + "step": 22203 + }, + { + "epoch": 2.8245770258236864, + "ewc_loss": 0.039306640625, + "ewc_loss_parallel": 3.933906555175781e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.512378692626953, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8720901012420654, + "num_tokens": 847275964.0, + "step": 22204 + }, + { + "epoch": 2.824704236102277, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07732391357422, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8752705454826355, + "num_tokens": 847316815.0, + "step": 22205 + }, + { + "epoch": 2.8248314463808675, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.682231903076172, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8791183233261108, + "num_tokens": 847350949.0, + "step": 22206 + }, + { + "epoch": 2.824958656659458, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91025733947754, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8779771327972412, + "num_tokens": 847388945.0, + "step": 22207 + }, + { + "epoch": 2.8250858669380485, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.728412628173828, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8852260112762451, + "num_tokens": 847435114.0, + "step": 22208 + }, + { + "epoch": 2.825213077216639, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.888887405395508, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8663123846054077, + "num_tokens": 847470815.0, + "step": 22209 + }, + { + "epoch": 2.8253402874952296, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.856109619140625, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.872157096862793, + "num_tokens": 847509379.0, + "step": 22210 + }, + { + "epoch": 2.82546749777382, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.791791915893555, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8775647282600403, + "num_tokens": 847547427.0, + "step": 22211 + }, + { + "epoch": 2.8255947080524106, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.863054275512695, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8579232692718506, + "num_tokens": 847584740.0, + "step": 22212 + }, + { + "epoch": 2.825721918331001, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.087329864501953, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.862593412399292, + "num_tokens": 847626201.0, + "step": 22213 + }, + { + "epoch": 2.8258491286095917, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.797941207885742, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.887755274772644, + "num_tokens": 847661311.0, + "step": 22214 + }, + { + "epoch": 2.8259763388881822, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.853723526000977, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8706039190292358, + "num_tokens": 847695234.0, + "step": 22215 + }, + { + "epoch": 2.8261035491667728, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63483238220215, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8907110691070557, + "num_tokens": 847738848.0, + "step": 22216 + }, + { + "epoch": 2.8262307594453633, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.792293548583984, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8763828277587891, + "num_tokens": 847775952.0, + "step": 22217 + }, + { + "epoch": 2.826357969723954, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99466323852539, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8899602890014648, + "num_tokens": 847809381.0, + "step": 22218 + }, + { + "epoch": 2.8264851800025443, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.763092041015625, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8696330189704895, + "num_tokens": 847849835.0, + "step": 22219 + }, + { + "epoch": 2.826612390281135, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89461326599121, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8714850544929504, + "num_tokens": 847882726.0, + "step": 22220 + }, + { + "epoch": 2.8267396005597254, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.818552017211914, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8739856481552124, + "num_tokens": 847920663.0, + "step": 22221 + }, + { + "epoch": 2.8268668108383155, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59122657775879, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8821299076080322, + "num_tokens": 847962204.0, + "step": 22222 + }, + { + "epoch": 2.8269940211169065, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.814739227294922, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8669746518135071, + "num_tokens": 848003722.0, + "step": 22223 + }, + { + "epoch": 2.8271212313954965, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97437858581543, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8827692866325378, + "num_tokens": 848045335.0, + "step": 22224 + }, + { + "epoch": 2.8272484416740875, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.917068481445312, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8665226697921753, + "num_tokens": 848088817.0, + "step": 22225 + }, + { + "epoch": 2.8273756519526776, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.719377517700195, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8659806251525879, + "num_tokens": 848120993.0, + "step": 22226 + }, + { + "epoch": 2.827502862231268, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.797094345092773, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8659050464630127, + "num_tokens": 848162542.0, + "step": 22227 + }, + { + "epoch": 2.8276300725098586, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.861255645751953, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.886938214302063, + "num_tokens": 848198194.0, + "step": 22228 + }, + { + "epoch": 2.827757282788449, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77224349975586, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8692154288291931, + "num_tokens": 848240575.0, + "step": 22229 + }, + { + "epoch": 2.8278844930670397, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.818784713745117, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8734728693962097, + "num_tokens": 848289026.0, + "step": 22230 + }, + { + "epoch": 2.8280117033456302, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.601057052612305, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8764585256576538, + "num_tokens": 848326438.0, + "step": 22231 + }, + { + "epoch": 2.8281389136242208, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.868640899658203, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8802154660224915, + "num_tokens": 848368106.0, + "step": 22232 + }, + { + "epoch": 2.8282661239028113, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.76182746887207, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8710256814956665, + "num_tokens": 848409775.0, + "step": 22233 + }, + { + "epoch": 2.828393334181402, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.616079330444336, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8750600814819336, + "num_tokens": 848442149.0, + "step": 22234 + }, + { + "epoch": 2.8285205444599923, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.665361404418945, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8675590753555298, + "num_tokens": 848475468.0, + "step": 22235 + }, + { + "epoch": 2.828647754738583, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02872085571289, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8743041753768921, + "num_tokens": 848509565.0, + "step": 22236 + }, + { + "epoch": 2.8287749650171734, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.51434326171875, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8758273124694824, + "num_tokens": 848546357.0, + "step": 22237 + }, + { + "epoch": 2.828902175295764, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74064826965332, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8622595071792603, + "num_tokens": 848587661.0, + "step": 22238 + }, + { + "epoch": 2.8290293855743545, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.670732498168945, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8783011436462402, + "num_tokens": 848628250.0, + "step": 22239 + }, + { + "epoch": 2.829156595852945, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.572731018066406, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.875174343585968, + "num_tokens": 848670452.0, + "step": 22240 + }, + { + "epoch": 2.8292838061315355, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.929794311523438, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8758423924446106, + "num_tokens": 848703592.0, + "step": 22241 + }, + { + "epoch": 2.829411016410126, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.678455352783203, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8577977418899536, + "num_tokens": 848739291.0, + "step": 22242 + }, + { + "epoch": 2.8295382266887166, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.58608627319336, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8678102493286133, + "num_tokens": 848776039.0, + "step": 22243 + }, + { + "epoch": 2.829665436967307, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.769794464111328, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8758299350738525, + "num_tokens": 848814799.0, + "step": 22244 + }, + { + "epoch": 2.8297926472458976, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.700328826904297, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8749077320098877, + "num_tokens": 848860123.0, + "step": 22245 + }, + { + "epoch": 2.829919857524488, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.759113311767578, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8905907869338989, + "num_tokens": 848900447.0, + "step": 22246 + }, + { + "epoch": 2.8300470678030782, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83400535583496, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8680545091629028, + "num_tokens": 848945385.0, + "step": 22247 + }, + { + "epoch": 2.830174278081669, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.730548858642578, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.863798975944519, + "num_tokens": 848978783.0, + "step": 22248 + }, + { + "epoch": 2.8303014883602593, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.676624298095703, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.862196683883667, + "num_tokens": 849017324.0, + "step": 22249 + }, + { + "epoch": 2.8304286986388503, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93211555480957, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8934968113899231, + "num_tokens": 849045928.0, + "step": 22250 + }, + { + "epoch": 2.8305559089174404, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.649688720703125, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8737426996231079, + "num_tokens": 849080855.0, + "step": 22251 + }, + { + "epoch": 2.830683119196031, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66676902770996, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8779445886611938, + "num_tokens": 849114835.0, + "step": 22252 + }, + { + "epoch": 2.8308103294746214, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.78052520751953, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8631008863449097, + "num_tokens": 849152632.0, + "step": 22253 + }, + { + "epoch": 2.830937539753212, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.751008987426758, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.882697582244873, + "num_tokens": 849189911.0, + "step": 22254 + }, + { + "epoch": 2.8310647500318025, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.600418090820312, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8728855848312378, + "num_tokens": 849227847.0, + "step": 22255 + }, + { + "epoch": 2.831191960310393, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.638408660888672, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8781533241271973, + "num_tokens": 849265778.0, + "step": 22256 + }, + { + "epoch": 2.8313191705889835, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.869422912597656, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.873145580291748, + "num_tokens": 849302488.0, + "step": 22257 + }, + { + "epoch": 2.831446380867574, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7711238861084, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8884398937225342, + "num_tokens": 849337483.0, + "step": 22258 + }, + { + "epoch": 2.8315735911461646, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.64469337463379, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8797950744628906, + "num_tokens": 849378491.0, + "step": 22259 + }, + { + "epoch": 2.831700801424755, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.602720260620117, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8811063766479492, + "num_tokens": 849418897.0, + "step": 22260 + }, + { + "epoch": 2.8318280117033456, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.565187454223633, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8784170150756836, + "num_tokens": 849457057.0, + "step": 22261 + }, + { + "epoch": 2.831955221981936, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.789186477661133, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8789259791374207, + "num_tokens": 849488825.0, + "step": 22262 + }, + { + "epoch": 2.8320824322605267, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.691253662109375, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.881247878074646, + "num_tokens": 849528336.0, + "step": 22263 + }, + { + "epoch": 2.8322096425391172, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.608375549316406, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8876115083694458, + "num_tokens": 849565373.0, + "step": 22264 + }, + { + "epoch": 2.8323368528177078, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.512739181518555, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8855489492416382, + "num_tokens": 849600909.0, + "step": 22265 + }, + { + "epoch": 2.8324640630962983, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.840682983398438, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8651844263076782, + "num_tokens": 849639499.0, + "step": 22266 + }, + { + "epoch": 2.832591273374889, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.666919708251953, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8849822282791138, + "num_tokens": 849677232.0, + "step": 22267 + }, + { + "epoch": 2.8327184836534793, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74118423461914, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8794740438461304, + "num_tokens": 849717187.0, + "step": 22268 + }, + { + "epoch": 2.83284569393207, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83263397216797, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8845332264900208, + "num_tokens": 849756545.0, + "step": 22269 + }, + { + "epoch": 2.83297290421066, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.887075424194336, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8787661790847778, + "num_tokens": 849788670.0, + "step": 22270 + }, + { + "epoch": 2.833100114489251, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.66799545288086, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8727617263793945, + "num_tokens": 849825715.0, + "step": 22271 + }, + { + "epoch": 2.833227324767841, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.796669006347656, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8806321620941162, + "num_tokens": 849856435.0, + "step": 22272 + }, + { + "epoch": 2.833354535046432, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.789405822753906, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8886146545410156, + "num_tokens": 849895634.0, + "step": 22273 + }, + { + "epoch": 2.833481745325022, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.578937530517578, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8534285426139832, + "num_tokens": 849938646.0, + "step": 22274 + }, + { + "epoch": 2.833608955603613, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.994617462158203, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8714268803596497, + "num_tokens": 849978642.0, + "step": 22275 + }, + { + "epoch": 2.833736165882203, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.787208557128906, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8737875819206238, + "num_tokens": 850017820.0, + "step": 22276 + }, + { + "epoch": 2.8338633761607936, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.724117279052734, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8549851775169373, + "num_tokens": 850056619.0, + "step": 22277 + }, + { + "epoch": 2.833990586439384, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8732852935791, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.872258186340332, + "num_tokens": 850092456.0, + "step": 22278 + }, + { + "epoch": 2.8341177967179747, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6672420501709, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8716346025466919, + "num_tokens": 850131598.0, + "step": 22279 + }, + { + "epoch": 2.8342450069965652, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.828351974487305, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.865083634853363, + "num_tokens": 850170884.0, + "step": 22280 + }, + { + "epoch": 2.8343722172751558, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97098731994629, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8736832141876221, + "num_tokens": 850206989.0, + "step": 22281 + }, + { + "epoch": 2.8344994275537463, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60443115234375, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8706209659576416, + "num_tokens": 850248602.0, + "step": 22282 + }, + { + "epoch": 2.834626637832337, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.832294464111328, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8670058846473694, + "num_tokens": 850289790.0, + "step": 22283 + }, + { + "epoch": 2.8347538481109273, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.915597915649414, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.882421612739563, + "num_tokens": 850326688.0, + "step": 22284 + }, + { + "epoch": 2.834881058389518, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.694847106933594, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8853777647018433, + "num_tokens": 850366460.0, + "step": 22285 + }, + { + "epoch": 2.8350082686681084, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.204866409301758, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8799748420715332, + "num_tokens": 850400046.0, + "step": 22286 + }, + { + "epoch": 2.835135478946699, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.789581298828125, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8657970428466797, + "num_tokens": 850434374.0, + "step": 22287 + }, + { + "epoch": 2.8352626892252895, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.815515518188477, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8768006563186646, + "num_tokens": 850469056.0, + "step": 22288 + }, + { + "epoch": 2.83538989950388, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.59131622314453, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8726209402084351, + "num_tokens": 850505805.0, + "step": 22289 + }, + { + "epoch": 2.8355171097824705, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.68882942199707, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8623369932174683, + "num_tokens": 850550056.0, + "step": 22290 + }, + { + "epoch": 2.835644320061061, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.854080200195312, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8811818361282349, + "num_tokens": 850585075.0, + "step": 22291 + }, + { + "epoch": 2.8357715303396516, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.890165328979492, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8838712573051453, + "num_tokens": 850625758.0, + "step": 22292 + }, + { + "epoch": 2.835898740618242, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.60511016845703, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8876837491989136, + "num_tokens": 850664608.0, + "step": 22293 + }, + { + "epoch": 2.8360259508968326, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.723276138305664, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8819043040275574, + "num_tokens": 850705897.0, + "step": 22294 + }, + { + "epoch": 2.8361531611754227, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.679052352905273, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8695170879364014, + "num_tokens": 850746195.0, + "step": 22295 + }, + { + "epoch": 2.8362803714540137, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.812915802001953, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8818645477294922, + "num_tokens": 850785183.0, + "step": 22296 + }, + { + "epoch": 2.8364075817326038, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.822893142700195, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8803039789199829, + "num_tokens": 850819558.0, + "step": 22297 + }, + { + "epoch": 2.8365347920111947, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.737783432006836, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8647454977035522, + "num_tokens": 850857385.0, + "step": 22298 + }, + { + "epoch": 2.836662002289785, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.622425079345703, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8697309494018555, + "num_tokens": 850895297.0, + "step": 22299 + }, + { + "epoch": 2.836789212568376, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.831344604492188, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8738868236541748, + "num_tokens": 850930982.0, + "step": 22300 + }, + { + "epoch": 2.836916422846966, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80721092224121, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.870514988899231, + "num_tokens": 850969032.0, + "step": 22301 + }, + { + "epoch": 2.8370436331255564, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.896692276000977, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8774397373199463, + "num_tokens": 851008325.0, + "step": 22302 + }, + { + "epoch": 2.837170843404147, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.669954299926758, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8756383061408997, + "num_tokens": 851044289.0, + "step": 22303 + }, + { + "epoch": 2.8372980536827375, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.992843627929688, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.881072998046875, + "num_tokens": 851079887.0, + "step": 22304 + }, + { + "epoch": 2.837425263961328, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.738773345947266, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8784042596817017, + "num_tokens": 851124535.0, + "step": 22305 + }, + { + "epoch": 2.8375524742399185, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.973196029663086, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8830605149269104, + "num_tokens": 851162982.0, + "step": 22306 + }, + { + "epoch": 2.837679684518509, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.886892318725586, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8827376365661621, + "num_tokens": 851201864.0, + "step": 22307 + }, + { + "epoch": 2.8378068947970996, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.786243438720703, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8781565427780151, + "num_tokens": 851241659.0, + "step": 22308 + }, + { + "epoch": 2.83793410507569, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.642974853515625, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.866388201713562, + "num_tokens": 851278861.0, + "step": 22309 + }, + { + "epoch": 2.8380613153542806, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.857563018798828, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8766242265701294, + "num_tokens": 851314442.0, + "step": 22310 + }, + { + "epoch": 2.838188525632871, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08277130126953, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.869850218296051, + "num_tokens": 851356322.0, + "step": 22311 + }, + { + "epoch": 2.8383157359114617, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.647302627563477, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8631659746170044, + "num_tokens": 851395374.0, + "step": 22312 + }, + { + "epoch": 2.838442946190052, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.920429229736328, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8735507726669312, + "num_tokens": 851430957.0, + "step": 22313 + }, + { + "epoch": 2.8385701564686427, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.859188079833984, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8687360286712646, + "num_tokens": 851474357.0, + "step": 22314 + }, + { + "epoch": 2.8386973667472333, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.794593811035156, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8866344094276428, + "num_tokens": 851510346.0, + "step": 22315 + }, + { + "epoch": 2.838824577025824, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12736701965332, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8761024475097656, + "num_tokens": 851546798.0, + "step": 22316 + }, + { + "epoch": 2.8389517873044143, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.811939239501953, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8792890906333923, + "num_tokens": 851587819.0, + "step": 22317 + }, + { + "epoch": 2.839078997583005, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84478187561035, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8792883157730103, + "num_tokens": 851628764.0, + "step": 22318 + }, + { + "epoch": 2.8392062078615954, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.869216918945312, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8616322875022888, + "num_tokens": 851672206.0, + "step": 22319 + }, + { + "epoch": 2.8393334181401855, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.665279388427734, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8689393997192383, + "num_tokens": 851709969.0, + "step": 22320 + }, + { + "epoch": 2.8394606284187764, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.861713409423828, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8679333925247192, + "num_tokens": 851749015.0, + "step": 22321 + }, + { + "epoch": 2.8395878386973665, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.957460403442383, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8785213828086853, + "num_tokens": 851787210.0, + "step": 22322 + }, + { + "epoch": 2.8397150489759575, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.992752075195312, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8689508438110352, + "num_tokens": 851824677.0, + "step": 22323 + }, + { + "epoch": 2.8398422592545476, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63334846496582, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8795165419578552, + "num_tokens": 851863614.0, + "step": 22324 + }, + { + "epoch": 2.839969469533138, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.804853439331055, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8687403202056885, + "num_tokens": 851908153.0, + "step": 22325 + }, + { + "epoch": 2.8400966798117286, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.658050537109375, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8706678152084351, + "num_tokens": 851946473.0, + "step": 22326 + }, + { + "epoch": 2.840223890090319, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.816978454589844, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.863601565361023, + "num_tokens": 851991600.0, + "step": 22327 + }, + { + "epoch": 2.8403511003689097, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.810565948486328, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8761831521987915, + "num_tokens": 852029468.0, + "step": 22328 + }, + { + "epoch": 2.8404783106475002, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.496627807617188, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8799039125442505, + "num_tokens": 852061400.0, + "step": 22329 + }, + { + "epoch": 2.8406055209260908, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.339801788330078, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8736832141876221, + "num_tokens": 852100596.0, + "step": 22330 + }, + { + "epoch": 2.8407327312046813, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.649051666259766, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.876754641532898, + "num_tokens": 852140320.0, + "step": 22331 + }, + { + "epoch": 2.840859941483272, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.033641815185547, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8733774423599243, + "num_tokens": 852175969.0, + "step": 22332 + }, + { + "epoch": 2.8409871517618623, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.722660064697266, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8607171773910522, + "num_tokens": 852210816.0, + "step": 22333 + }, + { + "epoch": 2.841114362040453, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.918025970458984, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8730999827384949, + "num_tokens": 852246684.0, + "step": 22334 + }, + { + "epoch": 2.8412415723190434, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02201271057129, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8769828081130981, + "num_tokens": 852285283.0, + "step": 22335 + }, + { + "epoch": 2.841368782597634, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.912723541259766, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8826882839202881, + "num_tokens": 852323310.0, + "step": 22336 + }, + { + "epoch": 2.8414959928762245, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72443199157715, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.878625214099884, + "num_tokens": 852360850.0, + "step": 22337 + }, + { + "epoch": 2.841623203154815, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.938980102539062, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8680842518806458, + "num_tokens": 852399320.0, + "step": 22338 + }, + { + "epoch": 2.8417504134334055, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8012638092041, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8715018033981323, + "num_tokens": 852430980.0, + "step": 22339 + }, + { + "epoch": 2.841877623711996, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.82158088684082, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8846636414527893, + "num_tokens": 852465688.0, + "step": 22340 + }, + { + "epoch": 2.8420048339905866, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.758424758911133, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8733401298522949, + "num_tokens": 852504372.0, + "step": 22341 + }, + { + "epoch": 2.842132044269177, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77493667602539, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8791288137435913, + "num_tokens": 852542232.0, + "step": 22342 + }, + { + "epoch": 2.8422592545477676, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.062734603881836, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8651284575462341, + "num_tokens": 852580005.0, + "step": 22343 + }, + { + "epoch": 2.842386464826358, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83446502685547, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8864824771881104, + "num_tokens": 852619912.0, + "step": 22344 + }, + { + "epoch": 2.8425136751049482, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8570556640625, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8782215118408203, + "num_tokens": 852654349.0, + "step": 22345 + }, + { + "epoch": 2.842640885383539, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.989181518554688, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8665359020233154, + "num_tokens": 852691300.0, + "step": 22346 + }, + { + "epoch": 2.8427680956621293, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.716163635253906, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8724590539932251, + "num_tokens": 852731228.0, + "step": 22347 + }, + { + "epoch": 2.8428953059407203, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.357521057128906, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8737918138504028, + "num_tokens": 852768333.0, + "step": 22348 + }, + { + "epoch": 2.8430225162193103, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.721452713012695, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8443702459335327, + "num_tokens": 852810962.0, + "step": 22349 + }, + { + "epoch": 2.843149726497901, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98825454711914, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8905740976333618, + "num_tokens": 852844326.0, + "step": 22350 + }, + { + "epoch": 2.8432769367764914, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.914093017578125, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8690400719642639, + "num_tokens": 852877737.0, + "step": 22351 + }, + { + "epoch": 2.843404147055082, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71884536743164, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8817152976989746, + "num_tokens": 852922085.0, + "step": 22352 + }, + { + "epoch": 2.8435313573336725, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.956714630126953, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8912899494171143, + "num_tokens": 852960742.0, + "step": 22353 + }, + { + "epoch": 2.843658567612263, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.014446258544922, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8792970776557922, + "num_tokens": 852993502.0, + "step": 22354 + }, + { + "epoch": 2.8437857778908535, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.047767639160156, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8792217969894409, + "num_tokens": 853036844.0, + "step": 22355 + }, + { + "epoch": 2.843912988169444, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.003324508666992, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.874204158782959, + "num_tokens": 853070851.0, + "step": 22356 + }, + { + "epoch": 2.8440401984480346, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.704402923583984, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.874758243560791, + "num_tokens": 853106116.0, + "step": 22357 + }, + { + "epoch": 2.844167408726625, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.841073989868164, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8727700114250183, + "num_tokens": 853152776.0, + "step": 22358 + }, + { + "epoch": 2.8442946190052156, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12973403930664, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8737375140190125, + "num_tokens": 853194176.0, + "step": 22359 + }, + { + "epoch": 2.844421829283806, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.556354522705078, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8724982738494873, + "num_tokens": 853236834.0, + "step": 22360 + }, + { + "epoch": 2.8445490395623967, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11070442199707, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8663586378097534, + "num_tokens": 853274145.0, + "step": 22361 + }, + { + "epoch": 2.844676249840987, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.636085510253906, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.878010630607605, + "num_tokens": 853312569.0, + "step": 22362 + }, + { + "epoch": 2.8448034601195777, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.760278701782227, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8791809678077698, + "num_tokens": 853351742.0, + "step": 22363 + }, + { + "epoch": 2.8449306703981683, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.879322052001953, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8765019178390503, + "num_tokens": 853389607.0, + "step": 22364 + }, + { + "epoch": 2.845057880676759, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71637725830078, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8817494511604309, + "num_tokens": 853427218.0, + "step": 22365 + }, + { + "epoch": 2.8451850909553493, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89495849609375, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8779901266098022, + "num_tokens": 853464773.0, + "step": 22366 + }, + { + "epoch": 2.84531230123394, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81243896484375, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8744052648544312, + "num_tokens": 853507250.0, + "step": 22367 + }, + { + "epoch": 2.84543951151253, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.917343139648438, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8756265640258789, + "num_tokens": 853551947.0, + "step": 22368 + }, + { + "epoch": 2.845566721791121, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.891191482543945, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8667289018630981, + "num_tokens": 853590557.0, + "step": 22369 + }, + { + "epoch": 2.845693932069711, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.794002532958984, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8691042065620422, + "num_tokens": 853630870.0, + "step": 22370 + }, + { + "epoch": 2.845821142348302, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.787357330322266, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.87189781665802, + "num_tokens": 853662360.0, + "step": 22371 + }, + { + "epoch": 2.845948352626892, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8242130279541, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.869024932384491, + "num_tokens": 853703661.0, + "step": 22372 + }, + { + "epoch": 2.846075562905483, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71385955810547, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8646847009658813, + "num_tokens": 853742398.0, + "step": 22373 + }, + { + "epoch": 2.846202773184073, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80050277709961, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8816249370574951, + "num_tokens": 853781479.0, + "step": 22374 + }, + { + "epoch": 2.8463299834626636, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83418083190918, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8498550653457642, + "num_tokens": 853822330.0, + "step": 22375 + }, + { + "epoch": 2.846457193741254, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.700393676757812, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8620022535324097, + "num_tokens": 853862747.0, + "step": 22376 + }, + { + "epoch": 2.8465844040198447, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75237464904785, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8969263434410095, + "num_tokens": 853897124.0, + "step": 22377 + }, + { + "epoch": 2.8467116142984352, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.970487594604492, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8608441352844238, + "num_tokens": 853930516.0, + "step": 22378 + }, + { + "epoch": 2.8468388245770258, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.782100677490234, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8780101537704468, + "num_tokens": 853967016.0, + "step": 22379 + }, + { + "epoch": 2.8469660348556163, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87401580810547, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8573299646377563, + "num_tokens": 854008804.0, + "step": 22380 + }, + { + "epoch": 2.847093245134207, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.763301849365234, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8616006374359131, + "num_tokens": 854048393.0, + "step": 22381 + }, + { + "epoch": 2.8472204554127973, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.688255310058594, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8794776797294617, + "num_tokens": 854089480.0, + "step": 22382 + }, + { + "epoch": 2.847347665691388, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.06683349609375, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8855334520339966, + "num_tokens": 854125677.0, + "step": 22383 + }, + { + "epoch": 2.8474748759699784, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.725313186645508, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8699353933334351, + "num_tokens": 854160485.0, + "step": 22384 + }, + { + "epoch": 2.847602086248569, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73369026184082, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8670758605003357, + "num_tokens": 854200635.0, + "step": 22385 + }, + { + "epoch": 2.8477292965271594, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.945449829101562, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8780132532119751, + "num_tokens": 854240046.0, + "step": 22386 + }, + { + "epoch": 2.84785650680575, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.837190628051758, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8850377798080444, + "num_tokens": 854275880.0, + "step": 22387 + }, + { + "epoch": 2.8479837170843405, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.57009506225586, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8746692538261414, + "num_tokens": 854311204.0, + "step": 22388 + }, + { + "epoch": 2.848110927362931, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.68383026123047, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8717318773269653, + "num_tokens": 854346076.0, + "step": 22389 + }, + { + "epoch": 2.8482381376415216, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.811378479003906, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8809890747070312, + "num_tokens": 854383075.0, + "step": 22390 + }, + { + "epoch": 2.848365347920112, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.809167861938477, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8683664798736572, + "num_tokens": 854416011.0, + "step": 22391 + }, + { + "epoch": 2.8484925581987026, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69833755493164, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8761487007141113, + "num_tokens": 854449520.0, + "step": 22392 + }, + { + "epoch": 2.8486197684772927, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91016960144043, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8727837800979614, + "num_tokens": 854484355.0, + "step": 22393 + }, + { + "epoch": 2.8487469787558837, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.701900482177734, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8607679605484009, + "num_tokens": 854526238.0, + "step": 22394 + }, + { + "epoch": 2.8488741890344738, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74533462524414, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8770921230316162, + "num_tokens": 854562619.0, + "step": 22395 + }, + { + "epoch": 2.8490013993130647, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.759614944458008, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8796399831771851, + "num_tokens": 854604974.0, + "step": 22396 + }, + { + "epoch": 2.849128609591655, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.928632736206055, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8804916143417358, + "num_tokens": 854644066.0, + "step": 22397 + }, + { + "epoch": 2.849255819870246, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.934803009033203, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.877272367477417, + "num_tokens": 854682723.0, + "step": 22398 + }, + { + "epoch": 2.849383030148836, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.751420974731445, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8841317296028137, + "num_tokens": 854720292.0, + "step": 22399 + }, + { + "epoch": 2.8495102404274264, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.713062286376953, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.870816707611084, + "num_tokens": 854756917.0, + "step": 22400 + }, + { + "epoch": 2.849637450706017, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.029237747192383, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8760050535202026, + "num_tokens": 854798771.0, + "step": 22401 + }, + { + "epoch": 2.8497646609846075, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.789825439453125, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8720034956932068, + "num_tokens": 854838193.0, + "step": 22402 + }, + { + "epoch": 2.849891871263198, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.031938552856445, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8687828183174133, + "num_tokens": 854871650.0, + "step": 22403 + }, + { + "epoch": 2.8500190815417885, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8133487701416, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8723124861717224, + "num_tokens": 854910757.0, + "step": 22404 + }, + { + "epoch": 2.850146291820379, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.072847366333008, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8626440763473511, + "num_tokens": 854945018.0, + "step": 22405 + }, + { + "epoch": 2.8502735020989696, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7645206451416, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8663854002952576, + "num_tokens": 854988369.0, + "step": 22406 + }, + { + "epoch": 2.85040071237756, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.92263412475586, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8765129446983337, + "num_tokens": 855023935.0, + "step": 22407 + }, + { + "epoch": 2.8505279226561506, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.661306381225586, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8736511468887329, + "num_tokens": 855062002.0, + "step": 22408 + }, + { + "epoch": 2.850655132934741, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.012807846069336, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8778363466262817, + "num_tokens": 855101257.0, + "step": 22409 + }, + { + "epoch": 2.8507823432133317, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.889202117919922, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8770049810409546, + "num_tokens": 855135834.0, + "step": 22410 + }, + { + "epoch": 2.850909553491922, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.865158081054688, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8764589428901672, + "num_tokens": 855174156.0, + "step": 22411 + }, + { + "epoch": 2.8510367637705127, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.756704330444336, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8846712708473206, + "num_tokens": 855207698.0, + "step": 22412 + }, + { + "epoch": 2.8511639740491033, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.985559463500977, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8748338222503662, + "num_tokens": 855240636.0, + "step": 22413 + }, + { + "epoch": 2.851291184327694, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89518165588379, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8770670890808105, + "num_tokens": 855279923.0, + "step": 22414 + }, + { + "epoch": 2.8514183946062843, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.754070281982422, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8637135028839111, + "num_tokens": 855315460.0, + "step": 22415 + }, + { + "epoch": 2.851545604884875, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.648834228515625, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8818314075469971, + "num_tokens": 855351548.0, + "step": 22416 + }, + { + "epoch": 2.8516728151634654, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93961524963379, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8559938073158264, + "num_tokens": 855390715.0, + "step": 22417 + }, + { + "epoch": 2.8518000254420555, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.832977294921875, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8601710796356201, + "num_tokens": 855431997.0, + "step": 22418 + }, + { + "epoch": 2.8519272357206464, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.78739356994629, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8685944676399231, + "num_tokens": 855470242.0, + "step": 22419 + }, + { + "epoch": 2.8520544459992365, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.575780868530273, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.861196756362915, + "num_tokens": 855508901.0, + "step": 22420 + }, + { + "epoch": 2.8521816562778275, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.882568359375, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8700698018074036, + "num_tokens": 855545290.0, + "step": 22421 + }, + { + "epoch": 2.8523088665564176, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.845340728759766, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8714991807937622, + "num_tokens": 855579567.0, + "step": 22422 + }, + { + "epoch": 2.852436076835008, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.793378829956055, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8668228387832642, + "num_tokens": 855621117.0, + "step": 22423 + }, + { + "epoch": 2.8525632871135986, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.751415252685547, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8761901259422302, + "num_tokens": 855660494.0, + "step": 22424 + }, + { + "epoch": 2.852690497392189, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.701007843017578, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8744246363639832, + "num_tokens": 855694043.0, + "step": 22425 + }, + { + "epoch": 2.8528177076707797, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.589223861694336, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8601180911064148, + "num_tokens": 855734615.0, + "step": 22426 + }, + { + "epoch": 2.85294491794937, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.966968536376953, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8851000666618347, + "num_tokens": 855771141.0, + "step": 22427 + }, + { + "epoch": 2.8530721282279607, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.113479614257812, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8735978007316589, + "num_tokens": 855806877.0, + "step": 22428 + }, + { + "epoch": 2.8531993385065513, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.834949493408203, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8885831832885742, + "num_tokens": 855840076.0, + "step": 22429 + }, + { + "epoch": 2.853326548785142, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95887565612793, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8838712573051453, + "num_tokens": 855877972.0, + "step": 22430 + }, + { + "epoch": 2.8534537590637323, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.960405349731445, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8690139651298523, + "num_tokens": 855917424.0, + "step": 22431 + }, + { + "epoch": 2.853580969342323, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.967695236206055, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8638001680374146, + "num_tokens": 855954853.0, + "step": 22432 + }, + { + "epoch": 2.8537081796209134, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.778661727905273, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8756541609764099, + "num_tokens": 855993300.0, + "step": 22433 + }, + { + "epoch": 2.853835389899504, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03376579284668, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8779508471488953, + "num_tokens": 856035307.0, + "step": 22434 + }, + { + "epoch": 2.8539626001780944, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.842002868652344, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8718366026878357, + "num_tokens": 856071801.0, + "step": 22435 + }, + { + "epoch": 2.854089810456685, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.841703414916992, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8623114824295044, + "num_tokens": 856112035.0, + "step": 22436 + }, + { + "epoch": 2.8542170207352755, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.946624755859375, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8793929815292358, + "num_tokens": 856148275.0, + "step": 22437 + }, + { + "epoch": 2.854344231013866, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.722736358642578, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8767120838165283, + "num_tokens": 856180330.0, + "step": 22438 + }, + { + "epoch": 2.8544714412924566, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.046791076660156, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8606661558151245, + "num_tokens": 856219354.0, + "step": 22439 + }, + { + "epoch": 2.854598651571047, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.775449752807617, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8841084241867065, + "num_tokens": 856257583.0, + "step": 22440 + }, + { + "epoch": 2.8547258618496376, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.829933166503906, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8569586277008057, + "num_tokens": 856298056.0, + "step": 22441 + }, + { + "epoch": 2.854853072128228, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1145076751709, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8712712526321411, + "num_tokens": 856334109.0, + "step": 22442 + }, + { + "epoch": 2.8549802824068182, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.690813064575195, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8753787875175476, + "num_tokens": 856376655.0, + "step": 22443 + }, + { + "epoch": 2.855107492685409, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.845138549804688, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8899341821670532, + "num_tokens": 856409926.0, + "step": 22444 + }, + { + "epoch": 2.8552347029639993, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.769250869750977, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8655803203582764, + "num_tokens": 856451471.0, + "step": 22445 + }, + { + "epoch": 2.8553619132425903, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.668045043945312, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.882534384727478, + "num_tokens": 856483237.0, + "step": 22446 + }, + { + "epoch": 2.8554891235211803, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95326805114746, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8655770421028137, + "num_tokens": 856521278.0, + "step": 22447 + }, + { + "epoch": 2.855616333799771, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80751609802246, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8709672093391418, + "num_tokens": 856565048.0, + "step": 22448 + }, + { + "epoch": 2.8557435440783614, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94593048095703, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8773108720779419, + "num_tokens": 856603756.0, + "step": 22449 + }, + { + "epoch": 2.855870754356952, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73004913330078, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.879909336566925, + "num_tokens": 856640677.0, + "step": 22450 + }, + { + "epoch": 2.8559979646355425, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.911293029785156, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8740034103393555, + "num_tokens": 856679982.0, + "step": 22451 + }, + { + "epoch": 2.856125174914133, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.728649139404297, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8624057769775391, + "num_tokens": 856720621.0, + "step": 22452 + }, + { + "epoch": 2.8562523851927235, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.738975524902344, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8817275762557983, + "num_tokens": 856757172.0, + "step": 22453 + }, + { + "epoch": 2.856379595471314, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.811208724975586, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8717746734619141, + "num_tokens": 856796876.0, + "step": 22454 + }, + { + "epoch": 2.8565068057499046, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80072021484375, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8740885853767395, + "num_tokens": 856831148.0, + "step": 22455 + }, + { + "epoch": 2.856634016028495, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84322738647461, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.870401918888092, + "num_tokens": 856867091.0, + "step": 22456 + }, + { + "epoch": 2.8567612263070856, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.79942512512207, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8634971380233765, + "num_tokens": 856906118.0, + "step": 22457 + }, + { + "epoch": 2.856888436585676, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99049949645996, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8482321500778198, + "num_tokens": 856942694.0, + "step": 22458 + }, + { + "epoch": 2.8570156468642667, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86566925048828, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8660242557525635, + "num_tokens": 856977530.0, + "step": 22459 + }, + { + "epoch": 2.857142857142857, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.714391708374023, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8774586915969849, + "num_tokens": 857012417.0, + "step": 22460 + }, + { + "epoch": 2.8572700674214477, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.696718215942383, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.866863489151001, + "num_tokens": 857053556.0, + "step": 22461 + }, + { + "epoch": 2.8573972777000383, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.985219955444336, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8776172399520874, + "num_tokens": 857094514.0, + "step": 22462 + }, + { + "epoch": 2.857524487978629, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04366683959961, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8757179379463196, + "num_tokens": 857133670.0, + "step": 22463 + }, + { + "epoch": 2.8576516982572193, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.67487907409668, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8677002191543579, + "num_tokens": 857167934.0, + "step": 22464 + }, + { + "epoch": 2.85777890853581, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.85424041748047, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8560885190963745, + "num_tokens": 857208263.0, + "step": 22465 + }, + { + "epoch": 2.8579061188144, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.768774032592773, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8561934232711792, + "num_tokens": 857255107.0, + "step": 22466 + }, + { + "epoch": 2.858033329092991, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.841379165649414, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8710042834281921, + "num_tokens": 857298368.0, + "step": 22467 + }, + { + "epoch": 2.858160539371581, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.898818969726562, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8745319843292236, + "num_tokens": 857336899.0, + "step": 22468 + }, + { + "epoch": 2.858287749650172, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.85117530822754, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8788077235221863, + "num_tokens": 857378482.0, + "step": 22469 + }, + { + "epoch": 2.858414959928762, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.709123611450195, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.88344407081604, + "num_tokens": 857413203.0, + "step": 22470 + }, + { + "epoch": 2.858542170207353, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.758750915527344, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8725311160087585, + "num_tokens": 857446706.0, + "step": 22471 + }, + { + "epoch": 2.858669380485943, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.818449020385742, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.877550482749939, + "num_tokens": 857483415.0, + "step": 22472 + }, + { + "epoch": 2.8587965907645336, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.52356719970703, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8794513940811157, + "num_tokens": 857525323.0, + "step": 22473 + }, + { + "epoch": 2.858923801043124, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.913955688476562, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8905059695243835, + "num_tokens": 857564400.0, + "step": 22474 + }, + { + "epoch": 2.8590510113217147, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.82029914855957, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.872187614440918, + "num_tokens": 857602756.0, + "step": 22475 + }, + { + "epoch": 2.859178221600305, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.56578254699707, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8742880821228027, + "num_tokens": 857636875.0, + "step": 22476 + }, + { + "epoch": 2.8593054318788957, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.985759735107422, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8906534314155579, + "num_tokens": 857671950.0, + "step": 22477 + }, + { + "epoch": 2.8594326421574863, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05394744873047, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8864010572433472, + "num_tokens": 857705274.0, + "step": 22478 + }, + { + "epoch": 2.859559852436077, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.691797256469727, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8678805828094482, + "num_tokens": 857743730.0, + "step": 22479 + }, + { + "epoch": 2.8596870627146673, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.823482513427734, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.873079776763916, + "num_tokens": 857784055.0, + "step": 22480 + }, + { + "epoch": 2.859814272993258, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75638771057129, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8804689645767212, + "num_tokens": 857822431.0, + "step": 22481 + }, + { + "epoch": 2.8599414832718484, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83368492126465, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8849286437034607, + "num_tokens": 857859258.0, + "step": 22482 + }, + { + "epoch": 2.860068693550439, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.85392951965332, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8657759428024292, + "num_tokens": 857901209.0, + "step": 22483 + }, + { + "epoch": 2.8601959038290294, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.671375274658203, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8869504332542419, + "num_tokens": 857935563.0, + "step": 22484 + }, + { + "epoch": 2.86032311410762, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70807647705078, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8650697469711304, + "num_tokens": 857966777.0, + "step": 22485 + }, + { + "epoch": 2.8604503243862105, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70734977722168, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8785953521728516, + "num_tokens": 858005078.0, + "step": 22486 + }, + { + "epoch": 2.860577534664801, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81185531616211, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8618991374969482, + "num_tokens": 858049288.0, + "step": 22487 + }, + { + "epoch": 2.8607047449433916, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.755109786987305, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8942055702209473, + "num_tokens": 858089834.0, + "step": 22488 + }, + { + "epoch": 2.860831955221982, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.929122924804688, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8787586688995361, + "num_tokens": 858124152.0, + "step": 22489 + }, + { + "epoch": 2.8609591655005726, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.669336318969727, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8781509399414062, + "num_tokens": 858163202.0, + "step": 22490 + }, + { + "epoch": 2.8610863757791627, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.877857208251953, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8930336236953735, + "num_tokens": 858199173.0, + "step": 22491 + }, + { + "epoch": 2.8612135860577537, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.10514259338379, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8727446794509888, + "num_tokens": 858239693.0, + "step": 22492 + }, + { + "epoch": 2.8613407963363438, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.519779205322266, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.881820797920227, + "num_tokens": 858280720.0, + "step": 22493 + }, + { + "epoch": 2.8614680066149347, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.112346649169922, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8851249814033508, + "num_tokens": 858314145.0, + "step": 22494 + }, + { + "epoch": 2.861595216893525, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97543716430664, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8762984275817871, + "num_tokens": 858355177.0, + "step": 22495 + }, + { + "epoch": 2.861722427172116, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.600505828857422, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8777697086334229, + "num_tokens": 858393070.0, + "step": 22496 + }, + { + "epoch": 2.861849637450706, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98797607421875, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8737010955810547, + "num_tokens": 858431827.0, + "step": 22497 + }, + { + "epoch": 2.8619768477292964, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.63691520690918, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.872157633304596, + "num_tokens": 858474990.0, + "step": 22498 + }, + { + "epoch": 2.862104058007887, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.039737701416016, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8872506618499756, + "num_tokens": 858514698.0, + "step": 22499 + }, + { + "epoch": 2.8622312682864774, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.838417053222656, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8752573728561401, + "num_tokens": 858557601.0, + "step": 22500 + }, + { + "epoch": 2.862358478565068, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.768829345703125, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8829189538955688, + "num_tokens": 858596830.0, + "step": 22501 + }, + { + "epoch": 2.8624856888436585, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.062124252319336, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8649387359619141, + "num_tokens": 858637555.0, + "step": 22502 + }, + { + "epoch": 2.862612899122249, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.806583404541016, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8933336734771729, + "num_tokens": 858672813.0, + "step": 22503 + }, + { + "epoch": 2.8627401094008396, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.015552520751953, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8803013563156128, + "num_tokens": 858710743.0, + "step": 22504 + }, + { + "epoch": 2.86286731967943, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05755615234375, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8868032097816467, + "num_tokens": 858739486.0, + "step": 22505 + }, + { + "epoch": 2.8629945299580206, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.841049194335938, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8550165295600891, + "num_tokens": 858780678.0, + "step": 22506 + }, + { + "epoch": 2.863121740236611, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.878774642944336, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.870334267616272, + "num_tokens": 858826112.0, + "step": 22507 + }, + { + "epoch": 2.8632489505152017, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.822998046875, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8859802484512329, + "num_tokens": 858862158.0, + "step": 22508 + }, + { + "epoch": 2.863376160793792, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.751094818115234, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8794667720794678, + "num_tokens": 858905270.0, + "step": 22509 + }, + { + "epoch": 2.8635033710723827, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.79349136352539, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8677127957344055, + "num_tokens": 858941802.0, + "step": 22510 + }, + { + "epoch": 2.8636305813509733, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83109474182129, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8772854804992676, + "num_tokens": 858974672.0, + "step": 22511 + }, + { + "epoch": 2.863757791629564, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.789499282836914, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8733645677566528, + "num_tokens": 859015305.0, + "step": 22512 + }, + { + "epoch": 2.8638850019081543, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83268928527832, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8658932447433472, + "num_tokens": 859050646.0, + "step": 22513 + }, + { + "epoch": 2.864012212186745, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.908782958984375, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8840038776397705, + "num_tokens": 859090139.0, + "step": 22514 + }, + { + "epoch": 2.8641394224653354, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.624271392822266, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8789535760879517, + "num_tokens": 859130661.0, + "step": 22515 + }, + { + "epoch": 2.8642666327439255, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77652931213379, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8744657039642334, + "num_tokens": 859166399.0, + "step": 22516 + }, + { + "epoch": 2.8643938430225164, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.955215454101562, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8752802014350891, + "num_tokens": 859208349.0, + "step": 22517 + }, + { + "epoch": 2.8645210533011065, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.521223068237305, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8775448203086853, + "num_tokens": 859243784.0, + "step": 22518 + }, + { + "epoch": 2.8646482635796975, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.923532485961914, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8609018325805664, + "num_tokens": 859278345.0, + "step": 22519 + }, + { + "epoch": 2.8647754738582876, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.78312873840332, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.86495041847229, + "num_tokens": 859317454.0, + "step": 22520 + }, + { + "epoch": 2.864902684136878, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.985702514648438, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8706369996070862, + "num_tokens": 859352076.0, + "step": 22521 + }, + { + "epoch": 2.8650298944154686, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87038230895996, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8827970027923584, + "num_tokens": 859389957.0, + "step": 22522 + }, + { + "epoch": 2.865157104694059, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.839590072631836, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8746258616447449, + "num_tokens": 859425167.0, + "step": 22523 + }, + { + "epoch": 2.8652843149726497, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.00183868408203, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8717666864395142, + "num_tokens": 859470999.0, + "step": 22524 + }, + { + "epoch": 2.86541152525124, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.766048431396484, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8815705180168152, + "num_tokens": 859503763.0, + "step": 22525 + }, + { + "epoch": 2.8655387355298307, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.82389259338379, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8844215273857117, + "num_tokens": 859537739.0, + "step": 22526 + }, + { + "epoch": 2.8656659458084213, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.850440979003906, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8842766284942627, + "num_tokens": 859573497.0, + "step": 22527 + }, + { + "epoch": 2.865793156087012, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.853879928588867, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8714773654937744, + "num_tokens": 859606997.0, + "step": 22528 + }, + { + "epoch": 2.8659203663656023, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8160400390625, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8624369502067566, + "num_tokens": 859642278.0, + "step": 22529 + }, + { + "epoch": 2.866047576644193, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.563032150268555, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8864541053771973, + "num_tokens": 859684728.0, + "step": 22530 + }, + { + "epoch": 2.8661747869227834, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88339614868164, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8681609630584717, + "num_tokens": 859720946.0, + "step": 22531 + }, + { + "epoch": 2.866301997201374, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.772069931030273, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.872830331325531, + "num_tokens": 859752605.0, + "step": 22532 + }, + { + "epoch": 2.8664292074799644, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71446990966797, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8874292373657227, + "num_tokens": 859784206.0, + "step": 22533 + }, + { + "epoch": 2.866556417758555, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9212646484375, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.870868444442749, + "num_tokens": 859828142.0, + "step": 22534 + }, + { + "epoch": 2.8666836280371455, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.767555236816406, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8703817129135132, + "num_tokens": 859870032.0, + "step": 22535 + }, + { + "epoch": 2.866810838315736, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.681377410888672, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8644765615463257, + "num_tokens": 859911417.0, + "step": 22536 + }, + { + "epoch": 2.8669380485943265, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.79737091064453, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8862204551696777, + "num_tokens": 859953929.0, + "step": 22537 + }, + { + "epoch": 2.867065258872917, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.865577697753906, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8785825967788696, + "num_tokens": 859985402.0, + "step": 22538 + }, + { + "epoch": 2.8671924691515076, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95216178894043, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8721802234649658, + "num_tokens": 860017374.0, + "step": 22539 + }, + { + "epoch": 2.867319679430098, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.562681198120117, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8833670020103455, + "num_tokens": 860058217.0, + "step": 22540 + }, + { + "epoch": 2.867446889708688, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83060646057129, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8582977056503296, + "num_tokens": 860092317.0, + "step": 22541 + }, + { + "epoch": 2.867574099987279, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.049898147583008, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8704836368560791, + "num_tokens": 860125067.0, + "step": 22542 + }, + { + "epoch": 2.8677013102658693, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.827972412109375, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8869593143463135, + "num_tokens": 860163805.0, + "step": 22543 + }, + { + "epoch": 2.8678285205444602, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.943620681762695, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8621814250946045, + "num_tokens": 860200178.0, + "step": 22544 + }, + { + "epoch": 2.8679557308230503, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.714832305908203, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8749954700469971, + "num_tokens": 860235283.0, + "step": 22545 + }, + { + "epoch": 2.868082941101641, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.778717041015625, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8911087512969971, + "num_tokens": 860276776.0, + "step": 22546 + }, + { + "epoch": 2.8682101513802314, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97904396057129, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8789710402488708, + "num_tokens": 860318492.0, + "step": 22547 + }, + { + "epoch": 2.868337361658822, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.78028106689453, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8604917526245117, + "num_tokens": 860358556.0, + "step": 22548 + }, + { + "epoch": 2.8684645719374124, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.860576629638672, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.86588454246521, + "num_tokens": 860391114.0, + "step": 22549 + }, + { + "epoch": 2.868591782216003, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.611234664916992, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8706406354904175, + "num_tokens": 860430696.0, + "step": 22550 + }, + { + "epoch": 2.8687189924945935, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.890060424804688, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8813222646713257, + "num_tokens": 860471055.0, + "step": 22551 + }, + { + "epoch": 2.868846202773184, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.111879348754883, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8683528900146484, + "num_tokens": 860512472.0, + "step": 22552 + }, + { + "epoch": 2.8689734130517746, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70619773864746, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8702383041381836, + "num_tokens": 860541014.0, + "step": 22553 + }, + { + "epoch": 2.869100623330365, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.877174377441406, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.889668345451355, + "num_tokens": 860585025.0, + "step": 22554 + }, + { + "epoch": 2.8692278336089556, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.725141525268555, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8826289772987366, + "num_tokens": 860616361.0, + "step": 22555 + }, + { + "epoch": 2.869355043887546, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.880001068115234, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.867098867893219, + "num_tokens": 860655976.0, + "step": 22556 + }, + { + "epoch": 2.8694822541661367, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.950300216674805, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8626415729522705, + "num_tokens": 860703949.0, + "step": 22557 + }, + { + "epoch": 2.869609464444727, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.700035095214844, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8747743368148804, + "num_tokens": 860739935.0, + "step": 22558 + }, + { + "epoch": 2.8697366747233177, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75309944152832, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8780676126480103, + "num_tokens": 860782483.0, + "step": 22559 + }, + { + "epoch": 2.8698638850019083, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.730241775512695, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.863326907157898, + "num_tokens": 860816821.0, + "step": 22560 + }, + { + "epoch": 2.869991095280499, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.836027145385742, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8635420799255371, + "num_tokens": 860851313.0, + "step": 22561 + }, + { + "epoch": 2.8701183055590893, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.826557159423828, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8843158483505249, + "num_tokens": 860890540.0, + "step": 22562 + }, + { + "epoch": 2.87024551583768, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83799171447754, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8691243529319763, + "num_tokens": 860929686.0, + "step": 22563 + }, + { + "epoch": 2.87037272611627, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84852409362793, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8771579265594482, + "num_tokens": 860968186.0, + "step": 22564 + }, + { + "epoch": 2.870499936394861, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.601905822753906, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8701648712158203, + "num_tokens": 861007252.0, + "step": 22565 + }, + { + "epoch": 2.870627146673451, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89580726623535, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.875643253326416, + "num_tokens": 861049898.0, + "step": 22566 + }, + { + "epoch": 2.870754356952042, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.801420211791992, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8821835517883301, + "num_tokens": 861087711.0, + "step": 22567 + }, + { + "epoch": 2.870881567230632, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.663835525512695, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8556778430938721, + "num_tokens": 861128566.0, + "step": 22568 + }, + { + "epoch": 2.871008777509223, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04149627685547, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8786212205886841, + "num_tokens": 861156299.0, + "step": 22569 + }, + { + "epoch": 2.871135987787813, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.753183364868164, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8858368396759033, + "num_tokens": 861189376.0, + "step": 22570 + }, + { + "epoch": 2.8712631980664036, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93837547302246, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8711031675338745, + "num_tokens": 861227156.0, + "step": 22571 + }, + { + "epoch": 2.871390408344994, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.937355041503906, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8736833333969116, + "num_tokens": 861268088.0, + "step": 22572 + }, + { + "epoch": 2.8715176186235847, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.806425094604492, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8774720430374146, + "num_tokens": 861308892.0, + "step": 22573 + }, + { + "epoch": 2.871644828902175, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.736515045166016, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.876796305179596, + "num_tokens": 861350108.0, + "step": 22574 + }, + { + "epoch": 2.8717720391807657, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.995073318481445, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8841497302055359, + "num_tokens": 861390553.0, + "step": 22575 + }, + { + "epoch": 2.8718992494593563, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.697120666503906, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8766347169876099, + "num_tokens": 861425878.0, + "step": 22576 + }, + { + "epoch": 2.872026459737947, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87352180480957, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8912447094917297, + "num_tokens": 861460935.0, + "step": 22577 + }, + { + "epoch": 2.8721536700165373, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.72481346130371, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8589544296264648, + "num_tokens": 861500446.0, + "step": 22578 + }, + { + "epoch": 2.872280880295128, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.708969116210938, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8628271818161011, + "num_tokens": 861538000.0, + "step": 22579 + }, + { + "epoch": 2.8724080905737184, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69536018371582, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8771914839744568, + "num_tokens": 861578052.0, + "step": 22580 + }, + { + "epoch": 2.872535300852309, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.930313110351562, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8703631162643433, + "num_tokens": 861619065.0, + "step": 22581 + }, + { + "epoch": 2.8726625111308994, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.690383911132812, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8781591653823853, + "num_tokens": 861660112.0, + "step": 22582 + }, + { + "epoch": 2.87278972140949, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.718894958496094, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8762781620025635, + "num_tokens": 861699312.0, + "step": 22583 + }, + { + "epoch": 2.8729169316880805, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.114627838134766, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.891933023929596, + "num_tokens": 861735167.0, + "step": 22584 + }, + { + "epoch": 2.873044141966671, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.743043899536133, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8685026168823242, + "num_tokens": 861778594.0, + "step": 22585 + }, + { + "epoch": 2.8731713522452615, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93834114074707, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8745929002761841, + "num_tokens": 861814754.0, + "step": 22586 + }, + { + "epoch": 2.873298562523852, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.958362579345703, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8599927425384521, + "num_tokens": 861849019.0, + "step": 22587 + }, + { + "epoch": 2.8734257728024426, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.940275192260742, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8717652559280396, + "num_tokens": 861889471.0, + "step": 22588 + }, + { + "epoch": 2.8735529830810327, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.040616989135742, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8655434250831604, + "num_tokens": 861926811.0, + "step": 22589 + }, + { + "epoch": 2.8736801933596237, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.763141632080078, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8721162676811218, + "num_tokens": 861963628.0, + "step": 22590 + }, + { + "epoch": 2.8738074036382137, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.111848831176758, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8632490634918213, + "num_tokens": 862002915.0, + "step": 22591 + }, + { + "epoch": 2.8739346139168047, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04880714416504, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8811272382736206, + "num_tokens": 862047276.0, + "step": 22592 + }, + { + "epoch": 2.874061824195395, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70755386352539, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8826759457588196, + "num_tokens": 862091241.0, + "step": 22593 + }, + { + "epoch": 2.8741890344739858, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.228517532348633, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8783013820648193, + "num_tokens": 862126025.0, + "step": 22594 + }, + { + "epoch": 2.874316244752576, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.843721389770508, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.866483211517334, + "num_tokens": 862160822.0, + "step": 22595 + }, + { + "epoch": 2.8744434550311664, + "ewc_loss": 0.03955078125, + "ewc_loss_parallel": 3.9577484130859375e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.621700286865234, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8795377016067505, + "num_tokens": 862196808.0, + "step": 22596 + }, + { + "epoch": 2.874570665309757, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.057880401611328, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8604547381401062, + "num_tokens": 862233722.0, + "step": 22597 + }, + { + "epoch": 2.8746978755883474, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08711051940918, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8897222280502319, + "num_tokens": 862271914.0, + "step": 22598 + }, + { + "epoch": 2.874825085866938, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.713308334350586, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.871064305305481, + "num_tokens": 862313696.0, + "step": 22599 + }, + { + "epoch": 2.8749522961455285, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.814010620117188, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8677513599395752, + "num_tokens": 862355366.0, + "step": 22600 + }, + { + "epoch": 2.875079506424119, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.934974670410156, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8658469319343567, + "num_tokens": 862395630.0, + "step": 22601 + }, + { + "epoch": 2.8752067167027096, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.870258331298828, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8758910894393921, + "num_tokens": 862429650.0, + "step": 22602 + }, + { + "epoch": 2.8753339269813, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.013343811035156, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8717763423919678, + "num_tokens": 862470921.0, + "step": 22603 + }, + { + "epoch": 2.8754611372598906, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.653757095336914, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8800722360610962, + "num_tokens": 862505204.0, + "step": 22604 + }, + { + "epoch": 2.875588347538481, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.251989364624023, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8736269474029541, + "num_tokens": 862548686.0, + "step": 22605 + }, + { + "epoch": 2.8757155578170717, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.649749755859375, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8868923187255859, + "num_tokens": 862585920.0, + "step": 22606 + }, + { + "epoch": 2.875842768095662, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97672462463379, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8700111508369446, + "num_tokens": 862619364.0, + "step": 22607 + }, + { + "epoch": 2.8759699783742527, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.237274169921875, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.870596170425415, + "num_tokens": 862661317.0, + "step": 22608 + }, + { + "epoch": 2.8760971886528433, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7753963470459, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8781393766403198, + "num_tokens": 862698236.0, + "step": 22609 + }, + { + "epoch": 2.876224398931434, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12561798095703, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8848960399627686, + "num_tokens": 862739224.0, + "step": 22610 + }, + { + "epoch": 2.8763516092100243, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.893461227416992, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.889113187789917, + "num_tokens": 862776534.0, + "step": 22611 + }, + { + "epoch": 2.876478819488615, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.959659576416016, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8688259124755859, + "num_tokens": 862815690.0, + "step": 22612 + }, + { + "epoch": 2.8766060297672054, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.814807891845703, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8736007809638977, + "num_tokens": 862851021.0, + "step": 22613 + }, + { + "epoch": 2.8767332400457954, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81338119506836, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8829212784767151, + "num_tokens": 862885122.0, + "step": 22614 + }, + { + "epoch": 2.8768604503243864, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69244956970215, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8809938430786133, + "num_tokens": 862922233.0, + "step": 22615 + }, + { + "epoch": 2.8769876606029765, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.920316696166992, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8843159675598145, + "num_tokens": 862961355.0, + "step": 22616 + }, + { + "epoch": 2.8771148708815675, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.834402084350586, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8826454877853394, + "num_tokens": 862998523.0, + "step": 22617 + }, + { + "epoch": 2.8772420811601576, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.004230499267578, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8759840726852417, + "num_tokens": 863039684.0, + "step": 22618 + }, + { + "epoch": 2.877369291438748, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.898807525634766, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8787621259689331, + "num_tokens": 863077649.0, + "step": 22619 + }, + { + "epoch": 2.8774965017173386, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.732269287109375, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8709951043128967, + "num_tokens": 863116816.0, + "step": 22620 + }, + { + "epoch": 2.877623711995929, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.962265014648438, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8853600025177002, + "num_tokens": 863155306.0, + "step": 22621 + }, + { + "epoch": 2.8777509222745197, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.894363403320312, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8818748593330383, + "num_tokens": 863194381.0, + "step": 22622 + }, + { + "epoch": 2.87787813255311, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99219512939453, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8746669888496399, + "num_tokens": 863231470.0, + "step": 22623 + }, + { + "epoch": 2.8780053428317007, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.873592376708984, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8680181503295898, + "num_tokens": 863265617.0, + "step": 22624 + }, + { + "epoch": 2.8781325531102913, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.92621421813965, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8638901710510254, + "num_tokens": 863300111.0, + "step": 22625 + }, + { + "epoch": 2.878259763388882, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77505111694336, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.882964015007019, + "num_tokens": 863338626.0, + "step": 22626 + }, + { + "epoch": 2.8783869736674723, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04643440246582, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8680978417396545, + "num_tokens": 863374672.0, + "step": 22627 + }, + { + "epoch": 2.878514183946063, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.727092742919922, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8705990314483643, + "num_tokens": 863412717.0, + "step": 22628 + }, + { + "epoch": 2.8786413942246534, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99302101135254, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8832641839981079, + "num_tokens": 863447776.0, + "step": 22629 + }, + { + "epoch": 2.878768604503244, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70978546142578, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8911327123641968, + "num_tokens": 863485697.0, + "step": 22630 + }, + { + "epoch": 2.8788958147818344, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91936492919922, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.869265615940094, + "num_tokens": 863528706.0, + "step": 22631 + }, + { + "epoch": 2.879023025060425, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03135871887207, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8567306995391846, + "num_tokens": 863569181.0, + "step": 22632 + }, + { + "epoch": 2.8791502353390155, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.873924255371094, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8596125841140747, + "num_tokens": 863602985.0, + "step": 22633 + }, + { + "epoch": 2.879277445617606, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.71828269958496, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.87945157289505, + "num_tokens": 863634590.0, + "step": 22634 + }, + { + "epoch": 2.8794046558961965, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.672290802001953, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.872809648513794, + "num_tokens": 863677671.0, + "step": 22635 + }, + { + "epoch": 2.879531866174787, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.830753326416016, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8774051666259766, + "num_tokens": 863716102.0, + "step": 22636 + }, + { + "epoch": 2.8796590764533776, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.849260330200195, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8668646216392517, + "num_tokens": 863751560.0, + "step": 22637 + }, + { + "epoch": 2.879786286731968, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.914213180541992, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8895568251609802, + "num_tokens": 863792373.0, + "step": 22638 + }, + { + "epoch": 2.879913497010558, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.699447631835938, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8707044124603271, + "num_tokens": 863829969.0, + "step": 22639 + }, + { + "epoch": 2.880040707289149, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87322998046875, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8804947733879089, + "num_tokens": 863867080.0, + "step": 22640 + }, + { + "epoch": 2.8801679175677393, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.715362548828125, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.87856525182724, + "num_tokens": 863904230.0, + "step": 22641 + }, + { + "epoch": 2.8802951278463302, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.778661727905273, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8798210620880127, + "num_tokens": 863947309.0, + "step": 22642 + }, + { + "epoch": 2.8804223381249203, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.725656509399414, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8807722926139832, + "num_tokens": 863985557.0, + "step": 22643 + }, + { + "epoch": 2.880549548403511, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.975784301757812, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.863099217414856, + "num_tokens": 864021842.0, + "step": 22644 + }, + { + "epoch": 2.8806767586821014, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.955904006958008, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8666412234306335, + "num_tokens": 864060765.0, + "step": 22645 + }, + { + "epoch": 2.880803968960692, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.75973892211914, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8777730464935303, + "num_tokens": 864100624.0, + "step": 22646 + }, + { + "epoch": 2.8809311792392824, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.931922912597656, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8853473663330078, + "num_tokens": 864136599.0, + "step": 22647 + }, + { + "epoch": 2.881058389517873, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.836814880371094, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8784831166267395, + "num_tokens": 864169876.0, + "step": 22648 + }, + { + "epoch": 2.8811855997964635, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.990880966186523, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8739694356918335, + "num_tokens": 864213709.0, + "step": 22649 + }, + { + "epoch": 2.881312810075054, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.789146423339844, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.880419135093689, + "num_tokens": 864245581.0, + "step": 22650 + }, + { + "epoch": 2.8814400203536445, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.844104766845703, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8605870604515076, + "num_tokens": 864287081.0, + "step": 22651 + }, + { + "epoch": 2.881567230632235, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.92132568359375, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8590636253356934, + "num_tokens": 864321480.0, + "step": 22652 + }, + { + "epoch": 2.8816944409108256, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.00082778930664, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8814874887466431, + "num_tokens": 864358580.0, + "step": 22653 + }, + { + "epoch": 2.881821651189416, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.69513511657715, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8667739033699036, + "num_tokens": 864396277.0, + "step": 22654 + }, + { + "epoch": 2.8819488614680067, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02802276611328, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8700021505355835, + "num_tokens": 864433282.0, + "step": 22655 + }, + { + "epoch": 2.882076071746597, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.948280334472656, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8672500252723694, + "num_tokens": 864469305.0, + "step": 22656 + }, + { + "epoch": 2.8822032820251877, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.85007667541504, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8761911988258362, + "num_tokens": 864507246.0, + "step": 22657 + }, + { + "epoch": 2.8823304923037782, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.910585403442383, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8694566488265991, + "num_tokens": 864547315.0, + "step": 22658 + }, + { + "epoch": 2.8824577025823688, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.958192825317383, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.869015097618103, + "num_tokens": 864590813.0, + "step": 22659 + }, + { + "epoch": 2.8825849128609593, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73186492919922, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8660395741462708, + "num_tokens": 864625078.0, + "step": 22660 + }, + { + "epoch": 2.88271212313955, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93861198425293, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8817620873451233, + "num_tokens": 864663714.0, + "step": 22661 + }, + { + "epoch": 2.88283933341814, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.907833099365234, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8734483122825623, + "num_tokens": 864697026.0, + "step": 22662 + }, + { + "epoch": 2.882966543696731, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.737133026123047, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8847054839134216, + "num_tokens": 864732337.0, + "step": 22663 + }, + { + "epoch": 2.883093753975321, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.109010696411133, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8684262037277222, + "num_tokens": 864776598.0, + "step": 22664 + }, + { + "epoch": 2.883220964253912, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84444808959961, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8616943359375, + "num_tokens": 864817279.0, + "step": 22665 + }, + { + "epoch": 2.883348174532502, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81815528869629, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8805931806564331, + "num_tokens": 864854914.0, + "step": 22666 + }, + { + "epoch": 2.883475384811093, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.324480056762695, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8777468204498291, + "num_tokens": 864892914.0, + "step": 22667 + }, + { + "epoch": 2.883602595089683, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91561508178711, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8715254664421082, + "num_tokens": 864934189.0, + "step": 22668 + }, + { + "epoch": 2.8837298053682736, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.910064697265625, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8836479187011719, + "num_tokens": 864972085.0, + "step": 22669 + }, + { + "epoch": 2.883857015646864, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.959054946899414, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8582570552825928, + "num_tokens": 865008674.0, + "step": 22670 + }, + { + "epoch": 2.8839842259254547, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.836177825927734, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8644354939460754, + "num_tokens": 865046004.0, + "step": 22671 + }, + { + "epoch": 2.884111436204045, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.999601364135742, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8789823651313782, + "num_tokens": 865082409.0, + "step": 22672 + }, + { + "epoch": 2.8842386464826357, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.887165069580078, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8741588592529297, + "num_tokens": 865124457.0, + "step": 22673 + }, + { + "epoch": 2.8843658567612263, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.710338592529297, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8858346939086914, + "num_tokens": 865162290.0, + "step": 22674 + }, + { + "epoch": 2.884493067039817, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.962074279785156, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8731437921524048, + "num_tokens": 865201686.0, + "step": 22675 + }, + { + "epoch": 2.8846202773184073, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90498161315918, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8653286099433899, + "num_tokens": 865236876.0, + "step": 22676 + }, + { + "epoch": 2.884747487596998, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.852094650268555, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8786391019821167, + "num_tokens": 865276994.0, + "step": 22677 + }, + { + "epoch": 2.8848746978755884, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.883304595947266, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8739856481552124, + "num_tokens": 865311990.0, + "step": 22678 + }, + { + "epoch": 2.885001908154179, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.060775756835938, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8808586597442627, + "num_tokens": 865352324.0, + "step": 22679 + }, + { + "epoch": 2.8851291184327694, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.301742553710938, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.874786376953125, + "num_tokens": 865391428.0, + "step": 22680 + }, + { + "epoch": 2.88525632871136, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.764612197875977, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8728307485580444, + "num_tokens": 865428478.0, + "step": 22681 + }, + { + "epoch": 2.8853835389899505, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0262393951416, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8743966817855835, + "num_tokens": 865466531.0, + "step": 22682 + }, + { + "epoch": 2.885510749268541, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.015016555786133, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8816241025924683, + "num_tokens": 865503177.0, + "step": 22683 + }, + { + "epoch": 2.8856379595471315, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.844823837280273, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8676940202713013, + "num_tokens": 865539454.0, + "step": 22684 + }, + { + "epoch": 2.885765169825722, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8105411529541, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8926610946655273, + "num_tokens": 865571937.0, + "step": 22685 + }, + { + "epoch": 2.8858923801043126, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81070899963379, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8764535188674927, + "num_tokens": 865609457.0, + "step": 22686 + }, + { + "epoch": 2.8860195903829027, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98518943786621, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8617925643920898, + "num_tokens": 865647391.0, + "step": 22687 + }, + { + "epoch": 2.8861468006614936, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.040145874023438, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8729220628738403, + "num_tokens": 865680578.0, + "step": 22688 + }, + { + "epoch": 2.8862740109400837, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.838451385498047, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8717559576034546, + "num_tokens": 865714610.0, + "step": 22689 + }, + { + "epoch": 2.8864012212186747, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.032392501831055, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.870322048664093, + "num_tokens": 865755672.0, + "step": 22690 + }, + { + "epoch": 2.886528431497265, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.726165771484375, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8832699060440063, + "num_tokens": 865790111.0, + "step": 22691 + }, + { + "epoch": 2.8866556417758558, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.757448196411133, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8754857778549194, + "num_tokens": 865825580.0, + "step": 22692 + }, + { + "epoch": 2.886782852054446, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.918041229248047, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8777048587799072, + "num_tokens": 865869099.0, + "step": 22693 + }, + { + "epoch": 2.8869100623330364, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.781173706054688, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8650288581848145, + "num_tokens": 865906957.0, + "step": 22694 + }, + { + "epoch": 2.887037272611627, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.813369750976562, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8844152092933655, + "num_tokens": 865945712.0, + "step": 22695 + }, + { + "epoch": 2.8871644828902174, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.073486328125, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8602908253669739, + "num_tokens": 865984469.0, + "step": 22696 + }, + { + "epoch": 2.887291693168808, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90274429321289, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8752121925354004, + "num_tokens": 866024624.0, + "step": 22697 + }, + { + "epoch": 2.8874189034473985, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07857894897461, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8715121746063232, + "num_tokens": 866062446.0, + "step": 22698 + }, + { + "epoch": 2.887546113725989, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99820899963379, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8839871287345886, + "num_tokens": 866100896.0, + "step": 22699 + }, + { + "epoch": 2.8876733240045795, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80301284790039, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8606674671173096, + "num_tokens": 866134668.0, + "step": 22700 + }, + { + "epoch": 2.88780053428317, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.895254135131836, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8680375814437866, + "num_tokens": 866173834.0, + "step": 22701 + }, + { + "epoch": 2.8879277445617606, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.953224182128906, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8580992817878723, + "num_tokens": 866211691.0, + "step": 22702 + }, + { + "epoch": 2.888054954840351, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84933853149414, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8743808269500732, + "num_tokens": 866240878.0, + "step": 22703 + }, + { + "epoch": 2.8881821651189417, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.867549896240234, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8607170581817627, + "num_tokens": 866289552.0, + "step": 22704 + }, + { + "epoch": 2.888309375397532, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.908260345458984, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8655052185058594, + "num_tokens": 866330361.0, + "step": 22705 + }, + { + "epoch": 2.8884365856761227, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03018569946289, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8782272338867188, + "num_tokens": 866371557.0, + "step": 22706 + }, + { + "epoch": 2.8885637959547132, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.834306716918945, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.874565064907074, + "num_tokens": 866410627.0, + "step": 22707 + }, + { + "epoch": 2.8886910062333038, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.049760818481445, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8730477094650269, + "num_tokens": 866446036.0, + "step": 22708 + }, + { + "epoch": 2.8888182165118943, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.930612564086914, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8645210266113281, + "num_tokens": 866489655.0, + "step": 22709 + }, + { + "epoch": 2.888945426790485, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.038904190063477, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8673539161682129, + "num_tokens": 866532750.0, + "step": 22710 + }, + { + "epoch": 2.8890726370690754, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.884647369384766, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8804610371589661, + "num_tokens": 866570748.0, + "step": 22711 + }, + { + "epoch": 2.8891998473476654, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98037338256836, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8684757947921753, + "num_tokens": 866609422.0, + "step": 22712 + }, + { + "epoch": 2.8893270576262564, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.803125381469727, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8756116628646851, + "num_tokens": 866650865.0, + "step": 22713 + }, + { + "epoch": 2.8894542679048465, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15070152282715, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8719629049301147, + "num_tokens": 866687605.0, + "step": 22714 + }, + { + "epoch": 2.8895814781834375, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8129940032959, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8642635345458984, + "num_tokens": 866722974.0, + "step": 22715 + }, + { + "epoch": 2.8897086884620276, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9888858795166, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8719143867492676, + "num_tokens": 866760694.0, + "step": 22716 + }, + { + "epoch": 2.889835898740618, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.832462310791016, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8559269905090332, + "num_tokens": 866795102.0, + "step": 22717 + }, + { + "epoch": 2.8899631090192086, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.918895721435547, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8865696787834167, + "num_tokens": 866831282.0, + "step": 22718 + }, + { + "epoch": 2.890090319297799, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.027603149414062, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8665488958358765, + "num_tokens": 866872661.0, + "step": 22719 + }, + { + "epoch": 2.8902175295763897, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.873884201049805, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.864531397819519, + "num_tokens": 866917576.0, + "step": 22720 + }, + { + "epoch": 2.89034473985498, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.822851181030273, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8775871992111206, + "num_tokens": 866957245.0, + "step": 22721 + }, + { + "epoch": 2.8904719501335707, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.880685806274414, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8464527130126953, + "num_tokens": 866998695.0, + "step": 22722 + }, + { + "epoch": 2.8905991604121613, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.891475677490234, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8602117300033569, + "num_tokens": 867033502.0, + "step": 22723 + }, + { + "epoch": 2.890726370690752, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.767732620239258, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.856123149394989, + "num_tokens": 867076032.0, + "step": 22724 + }, + { + "epoch": 2.8908535809693423, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.966957092285156, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8736332654953003, + "num_tokens": 867107429.0, + "step": 22725 + }, + { + "epoch": 2.890980791247933, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.969614028930664, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8761928677558899, + "num_tokens": 867139432.0, + "step": 22726 + }, + { + "epoch": 2.8911080015265234, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.577896118164062, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8703539371490479, + "num_tokens": 867180588.0, + "step": 22727 + }, + { + "epoch": 2.891235211805114, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02988052368164, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8710195422172546, + "num_tokens": 867215170.0, + "step": 22728 + }, + { + "epoch": 2.8913624220837044, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.752206802368164, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8878650665283203, + "num_tokens": 867254958.0, + "step": 22729 + }, + { + "epoch": 2.891489632362295, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.877140045166016, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8720033764839172, + "num_tokens": 867291135.0, + "step": 22730 + }, + { + "epoch": 2.8916168426408855, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.000898361206055, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8820476531982422, + "num_tokens": 867326926.0, + "step": 22731 + }, + { + "epoch": 2.891744052919476, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.746261596679688, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8651641607284546, + "num_tokens": 867361142.0, + "step": 22732 + }, + { + "epoch": 2.8918712631980665, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.929166793823242, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8914710283279419, + "num_tokens": 867398344.0, + "step": 22733 + }, + { + "epoch": 2.891998473476657, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.769319534301758, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8818258047103882, + "num_tokens": 867437642.0, + "step": 22734 + }, + { + "epoch": 2.8921256837552476, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.770572662353516, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8638494610786438, + "num_tokens": 867471646.0, + "step": 22735 + }, + { + "epoch": 2.892252894033838, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.816537857055664, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8741474747657776, + "num_tokens": 867509682.0, + "step": 22736 + }, + { + "epoch": 2.892380104312428, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.114377975463867, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8700612783432007, + "num_tokens": 867544425.0, + "step": 22737 + }, + { + "epoch": 2.892507314591019, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.111343383789062, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8673246502876282, + "num_tokens": 867584978.0, + "step": 22738 + }, + { + "epoch": 2.8926345248696093, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96786880493164, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8751170635223389, + "num_tokens": 867620360.0, + "step": 22739 + }, + { + "epoch": 2.8927617351482002, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.16058921813965, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8821197152137756, + "num_tokens": 867655246.0, + "step": 22740 + }, + { + "epoch": 2.8928889454267903, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93109703063965, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.865835428237915, + "num_tokens": 867692792.0, + "step": 22741 + }, + { + "epoch": 2.893016155705381, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91878318786621, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.875401496887207, + "num_tokens": 867732365.0, + "step": 22742 + }, + { + "epoch": 2.8931433659839714, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.907438278198242, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8594473600387573, + "num_tokens": 867764800.0, + "step": 22743 + }, + { + "epoch": 2.893270576262562, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.825420379638672, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.867154598236084, + "num_tokens": 867807107.0, + "step": 22744 + }, + { + "epoch": 2.8933977865411524, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01082420349121, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.865628719329834, + "num_tokens": 867850225.0, + "step": 22745 + }, + { + "epoch": 2.893524996819743, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.902414321899414, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8707759976387024, + "num_tokens": 867890457.0, + "step": 22746 + }, + { + "epoch": 2.8936522070983335, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07251739501953, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8808683156967163, + "num_tokens": 867925880.0, + "step": 22747 + }, + { + "epoch": 2.893779417376924, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9644775390625, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8866944909095764, + "num_tokens": 867963923.0, + "step": 22748 + }, + { + "epoch": 2.8939066276555145, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01784324645996, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8772673606872559, + "num_tokens": 868000489.0, + "step": 22749 + }, + { + "epoch": 2.894033837934105, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.902484893798828, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8778464794158936, + "num_tokens": 868034001.0, + "step": 22750 + }, + { + "epoch": 2.8941610482126956, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.055377960205078, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8782840371131897, + "num_tokens": 868074956.0, + "step": 22751 + }, + { + "epoch": 2.894288258491286, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.953950881958008, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.881833553314209, + "num_tokens": 868110727.0, + "step": 22752 + }, + { + "epoch": 2.8944154687698767, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97273826599121, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8562968373298645, + "num_tokens": 868151480.0, + "step": 22753 + }, + { + "epoch": 2.894542679048467, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.769895553588867, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8642085790634155, + "num_tokens": 868190620.0, + "step": 22754 + }, + { + "epoch": 2.8946698893270577, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.094392776489258, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8714343309402466, + "num_tokens": 868230598.0, + "step": 22755 + }, + { + "epoch": 2.8947970996056482, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93726348876953, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8678128123283386, + "num_tokens": 868265933.0, + "step": 22756 + }, + { + "epoch": 2.8949243098842388, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81252098083496, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8762260675430298, + "num_tokens": 868300282.0, + "step": 22757 + }, + { + "epoch": 2.8950515201628293, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9501895904541, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8883862495422363, + "num_tokens": 868331031.0, + "step": 22758 + }, + { + "epoch": 2.89517873044142, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.956199645996094, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8765716552734375, + "num_tokens": 868368268.0, + "step": 22759 + }, + { + "epoch": 2.89530594072001, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.738037109375, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8588792681694031, + "num_tokens": 868407551.0, + "step": 22760 + }, + { + "epoch": 2.895433150998601, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99458885192871, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8713416457176208, + "num_tokens": 868449301.0, + "step": 22761 + }, + { + "epoch": 2.895560361277191, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.6398983001709, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8712753057479858, + "num_tokens": 868482853.0, + "step": 22762 + }, + { + "epoch": 2.895687571555782, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.944143295288086, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8695816993713379, + "num_tokens": 868523960.0, + "step": 22763 + }, + { + "epoch": 2.895814781834372, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.071250915527344, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8664554953575134, + "num_tokens": 868566447.0, + "step": 22764 + }, + { + "epoch": 2.895941992112963, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.859663009643555, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8834564685821533, + "num_tokens": 868608342.0, + "step": 22765 + }, + { + "epoch": 2.896069202391553, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.79054069519043, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8852456212043762, + "num_tokens": 868647354.0, + "step": 22766 + }, + { + "epoch": 2.8961964126701436, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.039758682250977, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8549472689628601, + "num_tokens": 868690716.0, + "step": 22767 + }, + { + "epoch": 2.896323622948734, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.92994499206543, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8799681663513184, + "num_tokens": 868728318.0, + "step": 22768 + }, + { + "epoch": 2.8964508332273247, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.972763061523438, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8893956542015076, + "num_tokens": 868764323.0, + "step": 22769 + }, + { + "epoch": 2.896578043505915, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.859121322631836, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8572548031806946, + "num_tokens": 868801371.0, + "step": 22770 + }, + { + "epoch": 2.8967052537845057, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95811653137207, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8789915442466736, + "num_tokens": 868846679.0, + "step": 22771 + }, + { + "epoch": 2.8968324640630962, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.887086868286133, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8611201643943787, + "num_tokens": 868880076.0, + "step": 22772 + }, + { + "epoch": 2.8969596743416868, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08433723449707, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8751670122146606, + "num_tokens": 868917086.0, + "step": 22773 + }, + { + "epoch": 2.8970868846202773, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.717954635620117, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8803314566612244, + "num_tokens": 868954952.0, + "step": 22774 + }, + { + "epoch": 2.897214094898868, + "ewc_loss": 0.039794921875, + "ewc_loss_parallel": 3.981590270996094e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.957530975341797, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8695566654205322, + "num_tokens": 868990911.0, + "step": 22775 + }, + { + "epoch": 2.8973413051774584, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.835161209106445, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8849061727523804, + "num_tokens": 869033398.0, + "step": 22776 + }, + { + "epoch": 2.897468515456049, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.097368240356445, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8609669804573059, + "num_tokens": 869070513.0, + "step": 22777 + }, + { + "epoch": 2.8975957257346394, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.901710510253906, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.881118655204773, + "num_tokens": 869113494.0, + "step": 22778 + }, + { + "epoch": 2.89772293601323, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.958208084106445, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8823083639144897, + "num_tokens": 869147433.0, + "step": 22779 + }, + { + "epoch": 2.8978501462918205, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.931249618530273, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8695888519287109, + "num_tokens": 869186763.0, + "step": 22780 + }, + { + "epoch": 2.897977356570411, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.063312530517578, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8868736624717712, + "num_tokens": 869222624.0, + "step": 22781 + }, + { + "epoch": 2.8981045668490015, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81945037841797, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8943208456039429, + "num_tokens": 869263027.0, + "step": 22782 + }, + { + "epoch": 2.898231777127592, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89737892150879, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8688204288482666, + "num_tokens": 869304232.0, + "step": 22783 + }, + { + "epoch": 2.8983589874061826, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8245906829834, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8793735504150391, + "num_tokens": 869340141.0, + "step": 22784 + }, + { + "epoch": 2.8984861976847727, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.985740661621094, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8652859330177307, + "num_tokens": 869382428.0, + "step": 22785 + }, + { + "epoch": 2.8986134079633636, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.023710250854492, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8815981149673462, + "num_tokens": 869418890.0, + "step": 22786 + }, + { + "epoch": 2.8987406182419537, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.803688049316406, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8745383024215698, + "num_tokens": 869453704.0, + "step": 22787 + }, + { + "epoch": 2.8988678285205447, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90618133544922, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8714659214019775, + "num_tokens": 869491845.0, + "step": 22788 + }, + { + "epoch": 2.898995038799135, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84762954711914, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8859966993331909, + "num_tokens": 869523393.0, + "step": 22789 + }, + { + "epoch": 2.8991222490777258, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.770750045776367, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8727728128433228, + "num_tokens": 869561927.0, + "step": 22790 + }, + { + "epoch": 2.899249459356316, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05257225036621, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.872933030128479, + "num_tokens": 869599568.0, + "step": 22791 + }, + { + "epoch": 2.8993766696349064, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.788036346435547, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8777817487716675, + "num_tokens": 869636329.0, + "step": 22792 + }, + { + "epoch": 2.899503879913497, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05940818786621, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8698740005493164, + "num_tokens": 869679720.0, + "step": 22793 + }, + { + "epoch": 2.8996310901920874, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97195053100586, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8768439888954163, + "num_tokens": 869719527.0, + "step": 22794 + }, + { + "epoch": 2.899758300470678, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91893768310547, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8792893886566162, + "num_tokens": 869755139.0, + "step": 22795 + }, + { + "epoch": 2.8998855107492685, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.125900268554688, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8690747618675232, + "num_tokens": 869791860.0, + "step": 22796 + }, + { + "epoch": 2.900012721027859, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.013147354125977, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8958994150161743, + "num_tokens": 869827294.0, + "step": 22797 + }, + { + "epoch": 2.9001399313064495, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.990650177001953, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8689457774162292, + "num_tokens": 869859398.0, + "step": 22798 + }, + { + "epoch": 2.90026714158504, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83914566040039, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8732346892356873, + "num_tokens": 869898215.0, + "step": 22799 + }, + { + "epoch": 2.9003943518636306, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.020404815673828, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8641209006309509, + "num_tokens": 869939787.0, + "step": 22800 + }, + { + "epoch": 2.900521562142221, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.891841888427734, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.865752637386322, + "num_tokens": 869976790.0, + "step": 22801 + }, + { + "epoch": 2.9006487724208116, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.834684371948242, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8789129257202148, + "num_tokens": 870013547.0, + "step": 22802 + }, + { + "epoch": 2.900775982699402, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95072364807129, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8852661848068237, + "num_tokens": 870049540.0, + "step": 22803 + }, + { + "epoch": 2.9009031929779927, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.846012115478516, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8814921379089355, + "num_tokens": 870088247.0, + "step": 22804 + }, + { + "epoch": 2.9010304032565832, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.254337310791016, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8730651140213013, + "num_tokens": 870129112.0, + "step": 22805 + }, + { + "epoch": 2.9011576135351738, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.850961685180664, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8749869465827942, + "num_tokens": 870162082.0, + "step": 22806 + }, + { + "epoch": 2.9012848238137643, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.955354690551758, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8856842517852783, + "num_tokens": 870199868.0, + "step": 22807 + }, + { + "epoch": 2.901412034092355, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90448570251465, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8789469599723816, + "num_tokens": 870238121.0, + "step": 22808 + }, + { + "epoch": 2.9015392443709453, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.022907257080078, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8655931949615479, + "num_tokens": 870278008.0, + "step": 22809 + }, + { + "epoch": 2.9016664546495354, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.034080505371094, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8716064095497131, + "num_tokens": 870317671.0, + "step": 22810 + }, + { + "epoch": 2.9017936649281264, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.886287689208984, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8815219402313232, + "num_tokens": 870352893.0, + "step": 22811 + }, + { + "epoch": 2.9019208752067165, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86739158630371, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8820385932922363, + "num_tokens": 870387513.0, + "step": 22812 + }, + { + "epoch": 2.9020480854853075, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.883771896362305, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8810237646102905, + "num_tokens": 870428649.0, + "step": 22813 + }, + { + "epoch": 2.9021752957638975, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.081453323364258, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8887375593185425, + "num_tokens": 870468329.0, + "step": 22814 + }, + { + "epoch": 2.902302506042488, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.932018280029297, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8738563060760498, + "num_tokens": 870509273.0, + "step": 22815 + }, + { + "epoch": 2.9024297163210786, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.114788055419922, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8683090209960938, + "num_tokens": 870544206.0, + "step": 22816 + }, + { + "epoch": 2.902556926599669, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.017681121826172, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8917207717895508, + "num_tokens": 870578708.0, + "step": 22817 + }, + { + "epoch": 2.9026841368782597, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94074821472168, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8812914490699768, + "num_tokens": 870620599.0, + "step": 22818 + }, + { + "epoch": 2.90281134715685, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.020294189453125, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8882975578308105, + "num_tokens": 870657828.0, + "step": 22819 + }, + { + "epoch": 2.9029385574354407, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.840240478515625, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8693492412567139, + "num_tokens": 870694786.0, + "step": 22820 + }, + { + "epoch": 2.9030657677140312, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.792034149169922, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8911508321762085, + "num_tokens": 870733850.0, + "step": 22821 + }, + { + "epoch": 2.9031929779926218, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.869443893432617, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8739672899246216, + "num_tokens": 870771556.0, + "step": 22822 + }, + { + "epoch": 2.9033201882712123, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.92439842224121, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8862782716751099, + "num_tokens": 870809335.0, + "step": 22823 + }, + { + "epoch": 2.903447398549803, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.898117065429688, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8707800507545471, + "num_tokens": 870844239.0, + "step": 22824 + }, + { + "epoch": 2.9035746088283934, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.984420776367188, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8726311326026917, + "num_tokens": 870879540.0, + "step": 22825 + }, + { + "epoch": 2.903701819106984, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.77065086364746, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8748073577880859, + "num_tokens": 870917219.0, + "step": 22826 + }, + { + "epoch": 2.9038290293855744, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.946531295776367, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8760327100753784, + "num_tokens": 870952332.0, + "step": 22827 + }, + { + "epoch": 2.903956239664165, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.873029708862305, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8629869818687439, + "num_tokens": 870991609.0, + "step": 22828 + }, + { + "epoch": 2.9040834499427555, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96455955505371, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8781391382217407, + "num_tokens": 871027456.0, + "step": 22829 + }, + { + "epoch": 2.904210660221346, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.821895599365234, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8655339479446411, + "num_tokens": 871071833.0, + "step": 22830 + }, + { + "epoch": 2.9043378704999365, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.072429656982422, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8732852935791016, + "num_tokens": 871105446.0, + "step": 22831 + }, + { + "epoch": 2.904465080778527, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98796844482422, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8726760745048523, + "num_tokens": 871141408.0, + "step": 22832 + }, + { + "epoch": 2.9045922910571176, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.807849884033203, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8783313035964966, + "num_tokens": 871173296.0, + "step": 22833 + }, + { + "epoch": 2.904719501335708, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93608283996582, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8722994923591614, + "num_tokens": 871210711.0, + "step": 22834 + }, + { + "epoch": 2.904846711614298, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.903690338134766, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.871376097202301, + "num_tokens": 871251179.0, + "step": 22835 + }, + { + "epoch": 2.904973921892889, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.845802307128906, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8932313919067383, + "num_tokens": 871295758.0, + "step": 22836 + }, + { + "epoch": 2.9051011321714793, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84280776977539, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8590647578239441, + "num_tokens": 871331145.0, + "step": 22837 + }, + { + "epoch": 2.9052283424500702, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.78365707397461, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8503827452659607, + "num_tokens": 871372819.0, + "step": 22838 + }, + { + "epoch": 2.9053555527286603, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.765695571899414, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8701595664024353, + "num_tokens": 871414223.0, + "step": 22839 + }, + { + "epoch": 2.905482763007251, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.843076705932617, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8798000812530518, + "num_tokens": 871455013.0, + "step": 22840 + }, + { + "epoch": 2.9056099732858414, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.810590744018555, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8733857870101929, + "num_tokens": 871495462.0, + "step": 22841 + }, + { + "epoch": 2.905737183564432, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.806264877319336, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8776082396507263, + "num_tokens": 871534999.0, + "step": 22842 + }, + { + "epoch": 2.9058643938430224, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.81624984741211, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8757569789886475, + "num_tokens": 871569816.0, + "step": 22843 + }, + { + "epoch": 2.905991604121613, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.709774017333984, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8761670589447021, + "num_tokens": 871610441.0, + "step": 22844 + }, + { + "epoch": 2.9061188144002035, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12583351135254, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8610126972198486, + "num_tokens": 871649005.0, + "step": 22845 + }, + { + "epoch": 2.906246024678794, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87758445739746, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8697054386138916, + "num_tokens": 871689263.0, + "step": 22846 + }, + { + "epoch": 2.9063732349573845, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.951126098632812, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8837925791740417, + "num_tokens": 871725440.0, + "step": 22847 + }, + { + "epoch": 2.906500445235975, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84330940246582, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8715126514434814, + "num_tokens": 871767839.0, + "step": 22848 + }, + { + "epoch": 2.9066276555145656, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97159767150879, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8786200284957886, + "num_tokens": 871801516.0, + "step": 22849 + }, + { + "epoch": 2.906754865793156, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.923303604125977, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8697312474250793, + "num_tokens": 871840359.0, + "step": 22850 + }, + { + "epoch": 2.9068820760717466, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.200178146362305, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8792767524719238, + "num_tokens": 871879396.0, + "step": 22851 + }, + { + "epoch": 2.907009286350337, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.630895614624023, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8671751022338867, + "num_tokens": 871915043.0, + "step": 22852 + }, + { + "epoch": 2.9071364966289277, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.774410247802734, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8699314594268799, + "num_tokens": 871953391.0, + "step": 22853 + }, + { + "epoch": 2.9072637069075182, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.847915649414062, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8781924247741699, + "num_tokens": 871989879.0, + "step": 22854 + }, + { + "epoch": 2.9073909171861088, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.816356658935547, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8778759837150574, + "num_tokens": 872022428.0, + "step": 22855 + }, + { + "epoch": 2.9075181274646993, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87251853942871, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8793258666992188, + "num_tokens": 872060644.0, + "step": 22856 + }, + { + "epoch": 2.90764533774329, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.917634963989258, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8718700408935547, + "num_tokens": 872098230.0, + "step": 22857 + }, + { + "epoch": 2.90777254802188, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.82351303100586, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8641270399093628, + "num_tokens": 872131217.0, + "step": 22858 + }, + { + "epoch": 2.907899758300471, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.965106964111328, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8832457065582275, + "num_tokens": 872163912.0, + "step": 22859 + }, + { + "epoch": 2.908026968579061, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.958398818969727, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.850752592086792, + "num_tokens": 872200694.0, + "step": 22860 + }, + { + "epoch": 2.908154178857652, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.850772857666016, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8753249645233154, + "num_tokens": 872241156.0, + "step": 22861 + }, + { + "epoch": 2.908281389136242, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83200454711914, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8767243027687073, + "num_tokens": 872273048.0, + "step": 22862 + }, + { + "epoch": 2.908408599414833, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8385066986084, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8778639435768127, + "num_tokens": 872312165.0, + "step": 22863 + }, + { + "epoch": 2.908535809693423, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.016599655151367, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.877155065536499, + "num_tokens": 872352740.0, + "step": 22864 + }, + { + "epoch": 2.9086630199720136, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.636520385742188, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8607333898544312, + "num_tokens": 872397800.0, + "step": 22865 + }, + { + "epoch": 2.908790230250604, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.18927764892578, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.885948657989502, + "num_tokens": 872429307.0, + "step": 22866 + }, + { + "epoch": 2.9089174405291947, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.956798553466797, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.870914101600647, + "num_tokens": 872465897.0, + "step": 22867 + }, + { + "epoch": 2.909044650807785, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.860563278198242, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8659723997116089, + "num_tokens": 872503252.0, + "step": 22868 + }, + { + "epoch": 2.9091718610863757, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04373550415039, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8781865835189819, + "num_tokens": 872546276.0, + "step": 22869 + }, + { + "epoch": 2.9092990713649662, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7645320892334, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8730393052101135, + "num_tokens": 872585690.0, + "step": 22870 + }, + { + "epoch": 2.9094262816435568, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87620735168457, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8639971017837524, + "num_tokens": 872621718.0, + "step": 22871 + }, + { + "epoch": 2.9095534919221473, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.155397415161133, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8794413805007935, + "num_tokens": 872659349.0, + "step": 22872 + }, + { + "epoch": 2.909680702200738, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.870197296142578, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8698970079421997, + "num_tokens": 872704468.0, + "step": 22873 + }, + { + "epoch": 2.9098079124793284, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.940155029296875, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8750197887420654, + "num_tokens": 872740003.0, + "step": 22874 + }, + { + "epoch": 2.909935122757919, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.989423751831055, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8530875444412231, + "num_tokens": 872780362.0, + "step": 22875 + }, + { + "epoch": 2.9100623330365094, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94386100769043, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8824032545089722, + "num_tokens": 872823719.0, + "step": 22876 + }, + { + "epoch": 2.9101895433151, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.866050720214844, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8779969811439514, + "num_tokens": 872856276.0, + "step": 22877 + }, + { + "epoch": 2.9103167535936905, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94561195373535, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8674644231796265, + "num_tokens": 872893688.0, + "step": 22878 + }, + { + "epoch": 2.910443963872281, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.928539276123047, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8923619985580444, + "num_tokens": 872931169.0, + "step": 22879 + }, + { + "epoch": 2.9105711741508715, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96528434753418, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8700661063194275, + "num_tokens": 872970836.0, + "step": 22880 + }, + { + "epoch": 2.910698384429462, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.011442184448242, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8886042833328247, + "num_tokens": 873010879.0, + "step": 22881 + }, + { + "epoch": 2.9108255947080526, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.790266036987305, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8764712810516357, + "num_tokens": 873058018.0, + "step": 22882 + }, + { + "epoch": 2.9109528049866427, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.204679489135742, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8644193410873413, + "num_tokens": 873096259.0, + "step": 22883 + }, + { + "epoch": 2.9110800152652336, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.821636199951172, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8645252585411072, + "num_tokens": 873131955.0, + "step": 22884 + }, + { + "epoch": 2.9112072255438237, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88707733154297, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8681606650352478, + "num_tokens": 873171024.0, + "step": 22885 + }, + { + "epoch": 2.9113344358224147, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.964750289916992, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8782570958137512, + "num_tokens": 873210568.0, + "step": 22886 + }, + { + "epoch": 2.9114616461010048, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.793184280395508, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8771542906761169, + "num_tokens": 873250025.0, + "step": 22887 + }, + { + "epoch": 2.9115888563795957, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.920713424682617, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8658633232116699, + "num_tokens": 873285284.0, + "step": 22888 + }, + { + "epoch": 2.911716066658186, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.846521377563477, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8773754835128784, + "num_tokens": 873320102.0, + "step": 22889 + }, + { + "epoch": 2.9118432769367764, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.978330612182617, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8799201846122742, + "num_tokens": 873354668.0, + "step": 22890 + }, + { + "epoch": 2.911970487215367, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.867338180541992, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8723725080490112, + "num_tokens": 873387415.0, + "step": 22891 + }, + { + "epoch": 2.9120976974939574, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.405664443969727, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.891795814037323, + "num_tokens": 873428679.0, + "step": 22892 + }, + { + "epoch": 2.912224907772548, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70028305053711, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8759640455245972, + "num_tokens": 873466973.0, + "step": 22893 + }, + { + "epoch": 2.9123521180511385, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.028079986572266, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8893705010414124, + "num_tokens": 873510957.0, + "step": 22894 + }, + { + "epoch": 2.912479328329729, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.958948135375977, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8548616170883179, + "num_tokens": 873546456.0, + "step": 22895 + }, + { + "epoch": 2.9126065386083195, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.951658248901367, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.861638069152832, + "num_tokens": 873591972.0, + "step": 22896 + }, + { + "epoch": 2.91273374888691, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.086761474609375, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8723193407058716, + "num_tokens": 873629790.0, + "step": 22897 + }, + { + "epoch": 2.9128609591655006, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11437225341797, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8739675879478455, + "num_tokens": 873667419.0, + "step": 22898 + }, + { + "epoch": 2.912988169444091, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.804550170898438, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8838277459144592, + "num_tokens": 873705834.0, + "step": 22899 + }, + { + "epoch": 2.9131153797226816, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.081565856933594, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8664078712463379, + "num_tokens": 873748859.0, + "step": 22900 + }, + { + "epoch": 2.913242590001272, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.994661331176758, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8664283752441406, + "num_tokens": 873788068.0, + "step": 22901 + }, + { + "epoch": 2.9133698002798627, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84530258178711, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8830153942108154, + "num_tokens": 873827820.0, + "step": 22902 + }, + { + "epoch": 2.9134970105584532, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.876789093017578, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8714710474014282, + "num_tokens": 873866271.0, + "step": 22903 + }, + { + "epoch": 2.9136242208370438, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.756319046020508, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8713846802711487, + "num_tokens": 873907409.0, + "step": 22904 + }, + { + "epoch": 2.9137514311156343, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.178829193115234, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.881295382976532, + "num_tokens": 873947940.0, + "step": 22905 + }, + { + "epoch": 2.913878641394225, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.886280059814453, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8589800596237183, + "num_tokens": 873990062.0, + "step": 22906 + }, + { + "epoch": 2.9140058516728153, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.836713790893555, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8862019777297974, + "num_tokens": 874026660.0, + "step": 22907 + }, + { + "epoch": 2.9141330619514054, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.104019165039062, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.880966305732727, + "num_tokens": 874064935.0, + "step": 22908 + }, + { + "epoch": 2.9142602722299964, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.848573684692383, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8736415505409241, + "num_tokens": 874102807.0, + "step": 22909 + }, + { + "epoch": 2.9143874825085865, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01153564453125, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8755959272384644, + "num_tokens": 874139688.0, + "step": 22910 + }, + { + "epoch": 2.9145146927871775, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.941654205322266, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.879632830619812, + "num_tokens": 874184505.0, + "step": 22911 + }, + { + "epoch": 2.9146419030657675, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.959501266479492, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8746145367622375, + "num_tokens": 874223210.0, + "step": 22912 + }, + { + "epoch": 2.914769113344358, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1532039642334, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8626493811607361, + "num_tokens": 874263939.0, + "step": 22913 + }, + { + "epoch": 2.9148963236229486, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91053009033203, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8823504447937012, + "num_tokens": 874303946.0, + "step": 22914 + }, + { + "epoch": 2.915023533901539, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.24991798400879, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8765925168991089, + "num_tokens": 874343701.0, + "step": 22915 + }, + { + "epoch": 2.9151507441801296, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.878976821899414, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8775907754898071, + "num_tokens": 874380425.0, + "step": 22916 + }, + { + "epoch": 2.91527795445872, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03982925415039, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8756099939346313, + "num_tokens": 874417643.0, + "step": 22917 + }, + { + "epoch": 2.9154051647373107, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94991683959961, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.86797034740448, + "num_tokens": 874463939.0, + "step": 22918 + }, + { + "epoch": 2.9155323750159012, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.864784240722656, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8715614676475525, + "num_tokens": 874503156.0, + "step": 22919 + }, + { + "epoch": 2.9156595852944918, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.328636169433594, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8650828003883362, + "num_tokens": 874546754.0, + "step": 22920 + }, + { + "epoch": 2.9157867955730823, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.899396896362305, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8777023553848267, + "num_tokens": 874590781.0, + "step": 22921 + }, + { + "epoch": 2.915914005851673, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.878232955932617, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.883590579032898, + "num_tokens": 874629592.0, + "step": 22922 + }, + { + "epoch": 2.9160412161302633, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.061525344848633, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8819566369056702, + "num_tokens": 874667372.0, + "step": 22923 + }, + { + "epoch": 2.916168426408854, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.048383712768555, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8709981441497803, + "num_tokens": 874704687.0, + "step": 22924 + }, + { + "epoch": 2.9162956366874444, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9952335357666, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8734008073806763, + "num_tokens": 874740471.0, + "step": 22925 + }, + { + "epoch": 2.916422846966035, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.921594619750977, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8804318308830261, + "num_tokens": 874780747.0, + "step": 22926 + }, + { + "epoch": 2.9165500572446255, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.273263931274414, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8794125914573669, + "num_tokens": 874814204.0, + "step": 22927 + }, + { + "epoch": 2.916677267523216, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91120147705078, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8738794326782227, + "num_tokens": 874851872.0, + "step": 22928 + }, + { + "epoch": 2.9168044778018065, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.16680335998535, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.858959436416626, + "num_tokens": 874884215.0, + "step": 22929 + }, + { + "epoch": 2.916931688080397, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.960124969482422, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8724006414413452, + "num_tokens": 874919047.0, + "step": 22930 + }, + { + "epoch": 2.917058898358987, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.819059371948242, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8683993220329285, + "num_tokens": 874956469.0, + "step": 22931 + }, + { + "epoch": 2.917186108637578, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.821718215942383, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8854504823684692, + "num_tokens": 874994633.0, + "step": 22932 + }, + { + "epoch": 2.917313318916168, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.824729919433594, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8866758346557617, + "num_tokens": 875028530.0, + "step": 22933 + }, + { + "epoch": 2.917440529194759, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87340545654297, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.867882251739502, + "num_tokens": 875059785.0, + "step": 22934 + }, + { + "epoch": 2.9175677394733492, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97287940979004, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8719900846481323, + "num_tokens": 875096057.0, + "step": 22935 + }, + { + "epoch": 2.91769494975194, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.799373626708984, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8802942633628845, + "num_tokens": 875141988.0, + "step": 22936 + }, + { + "epoch": 2.9178221600305303, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.142499923706055, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8797597885131836, + "num_tokens": 875176973.0, + "step": 22937 + }, + { + "epoch": 2.917949370309121, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.916656494140625, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8833271265029907, + "num_tokens": 875209271.0, + "step": 22938 + }, + { + "epoch": 2.9180765805877114, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95022201538086, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8798332214355469, + "num_tokens": 875247460.0, + "step": 22939 + }, + { + "epoch": 2.918203790866302, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.143796920776367, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8728948831558228, + "num_tokens": 875286013.0, + "step": 22940 + }, + { + "epoch": 2.9183310011448924, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84545135498047, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8710048794746399, + "num_tokens": 875324547.0, + "step": 22941 + }, + { + "epoch": 2.918458211423483, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.983154296875, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8654017448425293, + "num_tokens": 875366655.0, + "step": 22942 + }, + { + "epoch": 2.9185854217020735, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.042922973632812, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8688203692436218, + "num_tokens": 875408571.0, + "step": 22943 + }, + { + "epoch": 2.918712631980664, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.982187271118164, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8689645528793335, + "num_tokens": 875450730.0, + "step": 22944 + }, + { + "epoch": 2.9188398422592545, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.133392333984375, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8782155513763428, + "num_tokens": 875486850.0, + "step": 22945 + }, + { + "epoch": 2.918967052537845, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.055017471313477, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.868155837059021, + "num_tokens": 875528653.0, + "step": 22946 + }, + { + "epoch": 2.9190942628164356, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.184093475341797, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8844415545463562, + "num_tokens": 875564155.0, + "step": 22947 + }, + { + "epoch": 2.919221473095026, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07517433166504, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.872683048248291, + "num_tokens": 875598548.0, + "step": 22948 + }, + { + "epoch": 2.9193486833736166, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.193410873413086, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8895296454429626, + "num_tokens": 875634587.0, + "step": 22949 + }, + { + "epoch": 2.919475893652207, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03307342529297, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.873222827911377, + "num_tokens": 875671956.0, + "step": 22950 + }, + { + "epoch": 2.9196031039307977, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07518196105957, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8766129016876221, + "num_tokens": 875706327.0, + "step": 22951 + }, + { + "epoch": 2.9197303142093882, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12952423095703, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8807374238967896, + "num_tokens": 875749320.0, + "step": 22952 + }, + { + "epoch": 2.9198575244879788, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.066730499267578, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8834731578826904, + "num_tokens": 875783871.0, + "step": 22953 + }, + { + "epoch": 2.9199847347665693, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.003122329711914, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8739526271820068, + "num_tokens": 875821716.0, + "step": 22954 + }, + { + "epoch": 2.92011194504516, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0213565826416, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.877812385559082, + "num_tokens": 875854339.0, + "step": 22955 + }, + { + "epoch": 2.92023915532375, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.32246971130371, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8756759166717529, + "num_tokens": 875889456.0, + "step": 22956 + }, + { + "epoch": 2.920366365602341, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.122934341430664, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8657792806625366, + "num_tokens": 875926659.0, + "step": 22957 + }, + { + "epoch": 2.920493575880931, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93642234802246, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8823210000991821, + "num_tokens": 875972766.0, + "step": 22958 + }, + { + "epoch": 2.920620786159522, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.948816299438477, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8711029887199402, + "num_tokens": 876011824.0, + "step": 22959 + }, + { + "epoch": 2.920747996438112, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1434383392334, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8578521013259888, + "num_tokens": 876050353.0, + "step": 22960 + }, + { + "epoch": 2.920875206716703, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86448097229004, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8609378337860107, + "num_tokens": 876089358.0, + "step": 22961 + }, + { + "epoch": 2.921002416995293, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.011234283447266, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8607131242752075, + "num_tokens": 876126287.0, + "step": 22962 + }, + { + "epoch": 2.9211296272738836, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.006662368774414, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.883407711982727, + "num_tokens": 876165157.0, + "step": 22963 + }, + { + "epoch": 2.921256837552474, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.046274185180664, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8753944635391235, + "num_tokens": 876201391.0, + "step": 22964 + }, + { + "epoch": 2.9213840478310646, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07476234436035, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8810452222824097, + "num_tokens": 876243004.0, + "step": 22965 + }, + { + "epoch": 2.921511258109655, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.953044891357422, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8761808276176453, + "num_tokens": 876279245.0, + "step": 22966 + }, + { + "epoch": 2.9216384683882457, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.122838973999023, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8740004897117615, + "num_tokens": 876312145.0, + "step": 22967 + }, + { + "epoch": 2.9217656786668362, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.85458755493164, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8758273124694824, + "num_tokens": 876351354.0, + "step": 22968 + }, + { + "epoch": 2.9218928889454268, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03754425048828, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8626959323883057, + "num_tokens": 876388621.0, + "step": 22969 + }, + { + "epoch": 2.9220200992240173, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.028505325317383, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8685480952262878, + "num_tokens": 876418377.0, + "step": 22970 + }, + { + "epoch": 2.922147309502608, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.19486427307129, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8778097629547119, + "num_tokens": 876453351.0, + "step": 22971 + }, + { + "epoch": 2.9222745197811983, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12994384765625, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8809512853622437, + "num_tokens": 876484830.0, + "step": 22972 + }, + { + "epoch": 2.922401730059789, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03561782836914, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.884181022644043, + "num_tokens": 876524318.0, + "step": 22973 + }, + { + "epoch": 2.9225289403383794, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.296924591064453, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8717846274375916, + "num_tokens": 876562421.0, + "step": 22974 + }, + { + "epoch": 2.92265615061697, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.000930786132812, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8680444359779358, + "num_tokens": 876605681.0, + "step": 22975 + }, + { + "epoch": 2.9227833608955605, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03799057006836, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8648989796638489, + "num_tokens": 876647371.0, + "step": 22976 + }, + { + "epoch": 2.922910571174151, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.802213668823242, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8798421621322632, + "num_tokens": 876685563.0, + "step": 22977 + }, + { + "epoch": 2.9230377814527415, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.249818801879883, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8706152439117432, + "num_tokens": 876718404.0, + "step": 22978 + }, + { + "epoch": 2.923164991731332, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.910917282104492, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8774820566177368, + "num_tokens": 876751888.0, + "step": 22979 + }, + { + "epoch": 2.9232922020099226, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.00528907775879, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8802902102470398, + "num_tokens": 876786813.0, + "step": 22980 + }, + { + "epoch": 2.9234194122885127, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.84605598449707, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8841614127159119, + "num_tokens": 876824511.0, + "step": 22981 + }, + { + "epoch": 2.9235466225671036, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8287353515625, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8771204948425293, + "num_tokens": 876856409.0, + "step": 22982 + }, + { + "epoch": 2.9236738328456937, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.13134002685547, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8607523441314697, + "num_tokens": 876889204.0, + "step": 22983 + }, + { + "epoch": 2.9238010431242847, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.770166397094727, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8748236298561096, + "num_tokens": 876927185.0, + "step": 22984 + }, + { + "epoch": 2.9239282534028748, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.031352996826172, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8714854717254639, + "num_tokens": 876965744.0, + "step": 22985 + }, + { + "epoch": 2.9240554636814657, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.056337356567383, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8801003694534302, + "num_tokens": 876997107.0, + "step": 22986 + }, + { + "epoch": 2.924182673960056, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.773502349853516, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8653053045272827, + "num_tokens": 877037894.0, + "step": 22987 + }, + { + "epoch": 2.9243098842386464, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.842851638793945, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8613653182983398, + "num_tokens": 877078724.0, + "step": 22988 + }, + { + "epoch": 2.924437094517237, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.776748657226562, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8750240802764893, + "num_tokens": 877119685.0, + "step": 22989 + }, + { + "epoch": 2.9245643047958274, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07341194152832, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8777708411216736, + "num_tokens": 877157055.0, + "step": 22990 + }, + { + "epoch": 2.924691515074418, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.970476150512695, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8803646564483643, + "num_tokens": 877188569.0, + "step": 22991 + }, + { + "epoch": 2.9248187253530085, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.809022903442383, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8672643899917603, + "num_tokens": 877222745.0, + "step": 22992 + }, + { + "epoch": 2.924945935631599, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98594093322754, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8588632345199585, + "num_tokens": 877262510.0, + "step": 22993 + }, + { + "epoch": 2.9250731459101895, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.917842864990234, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8679061532020569, + "num_tokens": 877300164.0, + "step": 22994 + }, + { + "epoch": 2.92520035618878, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.931325912475586, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8840150833129883, + "num_tokens": 877337187.0, + "step": 22995 + }, + { + "epoch": 2.9253275664673706, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.012849807739258, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8558412194252014, + "num_tokens": 877375902.0, + "step": 22996 + }, + { + "epoch": 2.925454776745961, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89134407043457, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8727098703384399, + "num_tokens": 877414406.0, + "step": 22997 + }, + { + "epoch": 2.9255819870245516, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.734344482421875, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8803284764289856, + "num_tokens": 877448977.0, + "step": 22998 + }, + { + "epoch": 2.925709197303142, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.24503517150879, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8816127777099609, + "num_tokens": 877486766.0, + "step": 22999 + }, + { + "epoch": 2.9258364075817327, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.70440673828125, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8874924182891846, + "num_tokens": 877519320.0, + "step": 23000 + }, + { + "epoch": 2.925963617860323, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.971649169921875, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8514090776443481, + "num_tokens": 877557913.0, + "step": 23001 + }, + { + "epoch": 2.9260908281389137, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.022926330566406, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8747940063476562, + "num_tokens": 877591197.0, + "step": 23002 + }, + { + "epoch": 2.9262180384175043, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.736860275268555, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8907257318496704, + "num_tokens": 877625018.0, + "step": 23003 + }, + { + "epoch": 2.926345248696095, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.846717834472656, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8788235783576965, + "num_tokens": 877662433.0, + "step": 23004 + }, + { + "epoch": 2.9264724589746853, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.761262893676758, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8861374258995056, + "num_tokens": 877703576.0, + "step": 23005 + }, + { + "epoch": 2.9265996692532754, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.922252655029297, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.870600700378418, + "num_tokens": 877734025.0, + "step": 23006 + }, + { + "epoch": 2.9267268795318664, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.733783721923828, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8828125, + "num_tokens": 877771942.0, + "step": 23007 + }, + { + "epoch": 2.9268540898104565, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.024160385131836, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8783174753189087, + "num_tokens": 877815662.0, + "step": 23008 + }, + { + "epoch": 2.9269813000890474, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96307373046875, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.858895480632782, + "num_tokens": 877855398.0, + "step": 23009 + }, + { + "epoch": 2.9271085103676375, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97791290283203, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8687711954116821, + "num_tokens": 877898544.0, + "step": 23010 + }, + { + "epoch": 2.927235720646228, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.89171028137207, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8732030391693115, + "num_tokens": 877937914.0, + "step": 23011 + }, + { + "epoch": 2.9273629309248186, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.049619674682617, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8577958345413208, + "num_tokens": 877979266.0, + "step": 23012 + }, + { + "epoch": 2.927490141203409, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.045917510986328, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8782042264938354, + "num_tokens": 878019762.0, + "step": 23013 + }, + { + "epoch": 2.9276173514819996, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.852764129638672, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8693856000900269, + "num_tokens": 878062174.0, + "step": 23014 + }, + { + "epoch": 2.92774456176059, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01715087890625, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8691920638084412, + "num_tokens": 878098197.0, + "step": 23015 + }, + { + "epoch": 2.9278717720391807, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.151952743530273, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8800457715988159, + "num_tokens": 878130478.0, + "step": 23016 + }, + { + "epoch": 2.9279989823177712, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.996150970458984, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8570618033409119, + "num_tokens": 878175670.0, + "step": 23017 + }, + { + "epoch": 2.9281261925963618, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.864622116088867, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8634779453277588, + "num_tokens": 878216415.0, + "step": 23018 + }, + { + "epoch": 2.9282534028749523, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.064531326293945, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8738154172897339, + "num_tokens": 878249582.0, + "step": 23019 + }, + { + "epoch": 2.928380613153543, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98557472229004, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8856041431427002, + "num_tokens": 878290859.0, + "step": 23020 + }, + { + "epoch": 2.9285078234321333, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83481788635254, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8902246952056885, + "num_tokens": 878322463.0, + "step": 23021 + }, + { + "epoch": 2.928635033710724, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.986787796020508, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8839888572692871, + "num_tokens": 878365754.0, + "step": 23022 + }, + { + "epoch": 2.9287622439893144, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.837312698364258, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8877657055854797, + "num_tokens": 878410325.0, + "step": 23023 + }, + { + "epoch": 2.928889454267905, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.862184524536133, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8799694776535034, + "num_tokens": 878451604.0, + "step": 23024 + }, + { + "epoch": 2.9290166645464955, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.841920852661133, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8773475885391235, + "num_tokens": 878485174.0, + "step": 23025 + }, + { + "epoch": 2.929143874825086, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.13142204284668, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8663435578346252, + "num_tokens": 878525867.0, + "step": 23026 + }, + { + "epoch": 2.9292710851036765, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.037199020385742, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8623125553131104, + "num_tokens": 878561327.0, + "step": 23027 + }, + { + "epoch": 2.929398295382267, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.07386016845703, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8809690475463867, + "num_tokens": 878603045.0, + "step": 23028 + }, + { + "epoch": 2.929525505660857, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.972158432006836, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8792837858200073, + "num_tokens": 878645051.0, + "step": 23029 + }, + { + "epoch": 2.929652715939448, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.947568893432617, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.85712730884552, + "num_tokens": 878682003.0, + "step": 23030 + }, + { + "epoch": 2.929779926218038, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.14899444580078, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8804805278778076, + "num_tokens": 878720509.0, + "step": 23031 + }, + { + "epoch": 2.929907136496629, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95784568786621, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8810191750526428, + "num_tokens": 878757950.0, + "step": 23032 + }, + { + "epoch": 2.9300343467752192, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.866188049316406, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8674935698509216, + "num_tokens": 878801477.0, + "step": 23033 + }, + { + "epoch": 2.93016155705381, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.152015686035156, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8675568103790283, + "num_tokens": 878838802.0, + "step": 23034 + }, + { + "epoch": 2.9302887673324003, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15672492980957, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8705400228500366, + "num_tokens": 878875566.0, + "step": 23035 + }, + { + "epoch": 2.930415977610991, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.832414627075195, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8674725294113159, + "num_tokens": 878913463.0, + "step": 23036 + }, + { + "epoch": 2.9305431878895813, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.23946189880371, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8636534214019775, + "num_tokens": 878950769.0, + "step": 23037 + }, + { + "epoch": 2.930670398168172, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.86333465576172, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8847348690032959, + "num_tokens": 878989041.0, + "step": 23038 + }, + { + "epoch": 2.9307976084467624, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.862768173217773, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8818726539611816, + "num_tokens": 879020195.0, + "step": 23039 + }, + { + "epoch": 2.930924818725353, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.127206802368164, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8728987574577332, + "num_tokens": 879057818.0, + "step": 23040 + }, + { + "epoch": 2.9310520290039435, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.838912963867188, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8718993067741394, + "num_tokens": 879094747.0, + "step": 23041 + }, + { + "epoch": 2.931179239282534, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.10103416442871, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.865016520023346, + "num_tokens": 879131127.0, + "step": 23042 + }, + { + "epoch": 2.9313064495611245, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9450740814209, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8747212886810303, + "num_tokens": 879168541.0, + "step": 23043 + }, + { + "epoch": 2.931433659839715, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.937362670898438, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8713679313659668, + "num_tokens": 879199409.0, + "step": 23044 + }, + { + "epoch": 2.9315608701183056, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.924346923828125, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8876700401306152, + "num_tokens": 879236447.0, + "step": 23045 + }, + { + "epoch": 2.931688080396896, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08530044555664, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8750889301300049, + "num_tokens": 879275596.0, + "step": 23046 + }, + { + "epoch": 2.9318152906754866, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05621337890625, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8728815317153931, + "num_tokens": 879315620.0, + "step": 23047 + }, + { + "epoch": 2.931942500954077, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.792709350585938, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8765794634819031, + "num_tokens": 879356220.0, + "step": 23048 + }, + { + "epoch": 2.9320697112326677, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02999496459961, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8592845797538757, + "num_tokens": 879397544.0, + "step": 23049 + }, + { + "epoch": 2.932196921511258, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.856782913208008, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8837398290634155, + "num_tokens": 879438165.0, + "step": 23050 + }, + { + "epoch": 2.9323241317898487, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.894559860229492, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8761577606201172, + "num_tokens": 879473601.0, + "step": 23051 + }, + { + "epoch": 2.9324513420684393, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.186872482299805, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8673518896102905, + "num_tokens": 879510998.0, + "step": 23052 + }, + { + "epoch": 2.93257855234703, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90993881225586, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8794263601303101, + "num_tokens": 879551469.0, + "step": 23053 + }, + { + "epoch": 2.93270576262562, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08038902282715, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8669083118438721, + "num_tokens": 879585620.0, + "step": 23054 + }, + { + "epoch": 2.932832972904211, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.965612411499023, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8846186399459839, + "num_tokens": 879621749.0, + "step": 23055 + }, + { + "epoch": 2.932960183182801, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95960807800293, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8777613043785095, + "num_tokens": 879664529.0, + "step": 23056 + }, + { + "epoch": 2.933087393461392, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93366813659668, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8796507716178894, + "num_tokens": 879696293.0, + "step": 23057 + }, + { + "epoch": 2.933214603739982, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9156494140625, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8797376751899719, + "num_tokens": 879733562.0, + "step": 23058 + }, + { + "epoch": 2.933341814018573, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.857454299926758, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8720715045928955, + "num_tokens": 879771740.0, + "step": 23059 + }, + { + "epoch": 2.933469024297163, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.044780731201172, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8701862692832947, + "num_tokens": 879805578.0, + "step": 23060 + }, + { + "epoch": 2.9335962345757536, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.774259567260742, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.871562659740448, + "num_tokens": 879845175.0, + "step": 23061 + }, + { + "epoch": 2.933723444854344, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.16959571838379, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8865863084793091, + "num_tokens": 879878975.0, + "step": 23062 + }, + { + "epoch": 2.9338506551329346, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.871286392211914, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8658075928688049, + "num_tokens": 879919373.0, + "step": 23063 + }, + { + "epoch": 2.933977865411525, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.8785457611084, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.853821337223053, + "num_tokens": 879953821.0, + "step": 23064 + }, + { + "epoch": 2.9341050756901157, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11143684387207, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.866053581237793, + "num_tokens": 879993505.0, + "step": 23065 + }, + { + "epoch": 2.934232285968706, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.827102661132812, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8790808320045471, + "num_tokens": 880030124.0, + "step": 23066 + }, + { + "epoch": 2.9343594962472968, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98272132873535, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8519778251647949, + "num_tokens": 880059480.0, + "step": 23067 + }, + { + "epoch": 2.9344867065258873, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.910314559936523, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.875869631767273, + "num_tokens": 880093402.0, + "step": 23068 + }, + { + "epoch": 2.934613916804478, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.785768508911133, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8722314238548279, + "num_tokens": 880127874.0, + "step": 23069 + }, + { + "epoch": 2.9347411270830683, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.981136322021484, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8834653496742249, + "num_tokens": 880165755.0, + "step": 23070 + }, + { + "epoch": 2.934868337361659, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.73554801940918, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8687347173690796, + "num_tokens": 880203931.0, + "step": 23071 + }, + { + "epoch": 2.9349955476402494, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.018573760986328, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8766984343528748, + "num_tokens": 880242844.0, + "step": 23072 + }, + { + "epoch": 2.93512275791884, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04739761352539, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8605445027351379, + "num_tokens": 880280068.0, + "step": 23073 + }, + { + "epoch": 2.9352499681974304, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95572280883789, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8801826238632202, + "num_tokens": 880314044.0, + "step": 23074 + }, + { + "epoch": 2.935377178476021, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05815887451172, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8706105351448059, + "num_tokens": 880355077.0, + "step": 23075 + }, + { + "epoch": 2.9355043887546115, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.117244720458984, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8889669179916382, + "num_tokens": 880388906.0, + "step": 23076 + }, + { + "epoch": 2.935631599033202, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93171501159668, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8852028846740723, + "num_tokens": 880426834.0, + "step": 23077 + }, + { + "epoch": 2.9357588093117926, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.09148406982422, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8725212812423706, + "num_tokens": 880463477.0, + "step": 23078 + }, + { + "epoch": 2.9358860195903826, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95658302307129, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8572235703468323, + "num_tokens": 880500884.0, + "step": 23079 + }, + { + "epoch": 2.9360132298689736, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.18616485595703, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8682698607444763, + "num_tokens": 880539724.0, + "step": 23080 + }, + { + "epoch": 2.9361404401475637, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12722396850586, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8676577210426331, + "num_tokens": 880577820.0, + "step": 23081 + }, + { + "epoch": 2.9362676504261547, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.310924530029297, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8778371810913086, + "num_tokens": 880612055.0, + "step": 23082 + }, + { + "epoch": 2.9363948607047448, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03862762451172, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8818286657333374, + "num_tokens": 880654907.0, + "step": 23083 + }, + { + "epoch": 2.9365220709833357, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9047794342041, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8892459273338318, + "num_tokens": 880692418.0, + "step": 23084 + }, + { + "epoch": 2.936649281261926, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.00752067565918, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8608565330505371, + "num_tokens": 880727742.0, + "step": 23085 + }, + { + "epoch": 2.9367764915405163, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.140762329101562, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8866630792617798, + "num_tokens": 880766673.0, + "step": 23086 + }, + { + "epoch": 2.936903701819107, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95627212524414, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8669902086257935, + "num_tokens": 880810390.0, + "step": 23087 + }, + { + "epoch": 2.9370309120976974, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03868293762207, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8694493770599365, + "num_tokens": 880846920.0, + "step": 23088 + }, + { + "epoch": 2.937158122376288, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04393196105957, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8802835941314697, + "num_tokens": 880885867.0, + "step": 23089 + }, + { + "epoch": 2.9372853326548785, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.762784957885742, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8857546448707581, + "num_tokens": 880928408.0, + "step": 23090 + }, + { + "epoch": 2.937412542933469, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.948274612426758, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8744450807571411, + "num_tokens": 880960681.0, + "step": 23091 + }, + { + "epoch": 2.9375397532120595, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.06696128845215, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8633227348327637, + "num_tokens": 881000358.0, + "step": 23092 + }, + { + "epoch": 2.93766696349065, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01398277282715, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8763909339904785, + "num_tokens": 881037765.0, + "step": 23093 + }, + { + "epoch": 2.9377941737692406, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.00013542175293, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8878406286239624, + "num_tokens": 881076091.0, + "step": 23094 + }, + { + "epoch": 2.937921384047831, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.835712432861328, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8739067316055298, + "num_tokens": 881112784.0, + "step": 23095 + }, + { + "epoch": 2.9380485943264216, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.138883590698242, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8803210258483887, + "num_tokens": 881149938.0, + "step": 23096 + }, + { + "epoch": 2.938175804605012, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.800769805908203, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8649139404296875, + "num_tokens": 881187541.0, + "step": 23097 + }, + { + "epoch": 2.9383030148836027, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1699275970459, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8754518032073975, + "num_tokens": 881221920.0, + "step": 23098 + }, + { + "epoch": 2.938430225162193, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.963546752929688, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8587592244148254, + "num_tokens": 881264804.0, + "step": 23099 + }, + { + "epoch": 2.9385574354407837, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88909149169922, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8656618595123291, + "num_tokens": 881304482.0, + "step": 23100 + }, + { + "epoch": 2.9386846457193743, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.74570655822754, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8648741841316223, + "num_tokens": 881340822.0, + "step": 23101 + }, + { + "epoch": 2.938811855997965, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12128257751465, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8730126619338989, + "num_tokens": 881374352.0, + "step": 23102 + }, + { + "epoch": 2.9389390662765553, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90765953063965, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8683499693870544, + "num_tokens": 881413176.0, + "step": 23103 + }, + { + "epoch": 2.9390662765551454, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.911754608154297, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8691174983978271, + "num_tokens": 881454309.0, + "step": 23104 + }, + { + "epoch": 2.9391934868337364, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.337690353393555, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8753188252449036, + "num_tokens": 881490702.0, + "step": 23105 + }, + { + "epoch": 2.9393206971123265, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.863121032714844, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8803616762161255, + "num_tokens": 881530394.0, + "step": 23106 + }, + { + "epoch": 2.9394479073909174, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.17283821105957, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8870484828948975, + "num_tokens": 881567286.0, + "step": 23107 + }, + { + "epoch": 2.9395751176695075, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02712059020996, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8680799007415771, + "num_tokens": 881607017.0, + "step": 23108 + }, + { + "epoch": 2.939702327948098, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.942045211791992, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8754140138626099, + "num_tokens": 881645793.0, + "step": 23109 + }, + { + "epoch": 2.9398295382266886, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.997764587402344, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.877045214176178, + "num_tokens": 881683596.0, + "step": 23110 + }, + { + "epoch": 2.939956748505279, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.971654891967773, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8587720990180969, + "num_tokens": 881717607.0, + "step": 23111 + }, + { + "epoch": 2.9400839587838696, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.019634246826172, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.886070966720581, + "num_tokens": 881750884.0, + "step": 23112 + }, + { + "epoch": 2.94021116906246, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.087865829467773, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8691903352737427, + "num_tokens": 881789345.0, + "step": 23113 + }, + { + "epoch": 2.9403383793410507, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.134302139282227, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8700911402702332, + "num_tokens": 881828459.0, + "step": 23114 + }, + { + "epoch": 2.940465589619641, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97463607788086, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.868274450302124, + "num_tokens": 881865294.0, + "step": 23115 + }, + { + "epoch": 2.9405927998982317, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.157766342163086, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8753565549850464, + "num_tokens": 881904936.0, + "step": 23116 + }, + { + "epoch": 2.9407200101768223, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.051231384277344, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8848618268966675, + "num_tokens": 881936300.0, + "step": 23117 + }, + { + "epoch": 2.940847220455413, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.033018112182617, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8856699466705322, + "num_tokens": 881974092.0, + "step": 23118 + }, + { + "epoch": 2.9409744307340033, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94638442993164, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8748868703842163, + "num_tokens": 882019339.0, + "step": 23119 + }, + { + "epoch": 2.941101641012594, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.319053649902344, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8776049613952637, + "num_tokens": 882052837.0, + "step": 23120 + }, + { + "epoch": 2.9412288512911844, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08744239807129, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8789438009262085, + "num_tokens": 882088449.0, + "step": 23121 + }, + { + "epoch": 2.941356061569775, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.080965042114258, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8649491667747498, + "num_tokens": 882124844.0, + "step": 23122 + }, + { + "epoch": 2.9414832718483654, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.24919891357422, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8762133121490479, + "num_tokens": 882162246.0, + "step": 23123 + }, + { + "epoch": 2.941610482126956, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.19210433959961, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8623923063278198, + "num_tokens": 882202825.0, + "step": 23124 + }, + { + "epoch": 2.9417376924055465, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98277473449707, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.860047459602356, + "num_tokens": 882237951.0, + "step": 23125 + }, + { + "epoch": 2.941864902684137, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.144763946533203, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8733538389205933, + "num_tokens": 882279147.0, + "step": 23126 + }, + { + "epoch": 2.941992112962727, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9608211517334, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.89377361536026, + "num_tokens": 882322777.0, + "step": 23127 + }, + { + "epoch": 2.942119323241318, + "ewc_loss": 0.0400390625, + "ewc_loss_parallel": 4.00543212890625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.877716064453125, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8780995607376099, + "num_tokens": 882360681.0, + "step": 23128 + }, + { + "epoch": 2.942246533519908, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.151918411254883, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8544226288795471, + "num_tokens": 882403823.0, + "step": 23129 + }, + { + "epoch": 2.942373743798499, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.011098861694336, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8823410868644714, + "num_tokens": 882443723.0, + "step": 23130 + }, + { + "epoch": 2.9425009540770892, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.970748901367188, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.859433650970459, + "num_tokens": 882480610.0, + "step": 23131 + }, + { + "epoch": 2.94262816435568, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11088752746582, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8765575885772705, + "num_tokens": 882518789.0, + "step": 23132 + }, + { + "epoch": 2.9427553746342703, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.064043045043945, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8614668250083923, + "num_tokens": 882556598.0, + "step": 23133 + }, + { + "epoch": 2.942882584912861, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.989246368408203, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8693831562995911, + "num_tokens": 882593788.0, + "step": 23134 + }, + { + "epoch": 2.9430097951914513, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.294950485229492, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8810135126113892, + "num_tokens": 882630861.0, + "step": 23135 + }, + { + "epoch": 2.943137005470042, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.182893753051758, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8823692202568054, + "num_tokens": 882670950.0, + "step": 23136 + }, + { + "epoch": 2.9432642157486324, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.129657745361328, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8646053075790405, + "num_tokens": 882705680.0, + "step": 23137 + }, + { + "epoch": 2.943391426027223, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.255617141723633, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8831249475479126, + "num_tokens": 882737589.0, + "step": 23138 + }, + { + "epoch": 2.9435186363058135, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.10967254638672, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8591136336326599, + "num_tokens": 882774502.0, + "step": 23139 + }, + { + "epoch": 2.943645846584404, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05057144165039, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8647878170013428, + "num_tokens": 882815184.0, + "step": 23140 + }, + { + "epoch": 2.9437730568629945, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.25528907775879, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8870440721511841, + "num_tokens": 882856793.0, + "step": 23141 + }, + { + "epoch": 2.943900267141585, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.163820266723633, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8844954967498779, + "num_tokens": 882896128.0, + "step": 23142 + }, + { + "epoch": 2.9440274774201756, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15363121032715, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8820985555648804, + "num_tokens": 882931253.0, + "step": 23143 + }, + { + "epoch": 2.944154687698766, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.149187088012695, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8901345729827881, + "num_tokens": 882975072.0, + "step": 23144 + }, + { + "epoch": 2.9442818979773566, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.110517501831055, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8864680528640747, + "num_tokens": 883013576.0, + "step": 23145 + }, + { + "epoch": 2.944409108255947, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.16913414001465, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8771165609359741, + "num_tokens": 883046052.0, + "step": 23146 + }, + { + "epoch": 2.9445363185345377, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.127002716064453, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8552590012550354, + "num_tokens": 883080398.0, + "step": 23147 + }, + { + "epoch": 2.944663528813128, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.995155334472656, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8657681941986084, + "num_tokens": 883120321.0, + "step": 23148 + }, + { + "epoch": 2.9447907390917187, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.064847946166992, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8805304169654846, + "num_tokens": 883161285.0, + "step": 23149 + }, + { + "epoch": 2.9449179493703093, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.989429473876953, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.873406171798706, + "num_tokens": 883199857.0, + "step": 23150 + }, + { + "epoch": 2.9450451596489, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.038179397583008, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8807517886161804, + "num_tokens": 883238325.0, + "step": 23151 + }, + { + "epoch": 2.94517236992749, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.110074996948242, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.865504264831543, + "num_tokens": 883281814.0, + "step": 23152 + }, + { + "epoch": 2.945299580206081, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95590591430664, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8786811232566833, + "num_tokens": 883317385.0, + "step": 23153 + }, + { + "epoch": 2.945426790484671, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.159408569335938, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8819090723991394, + "num_tokens": 883353498.0, + "step": 23154 + }, + { + "epoch": 2.945554000763262, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.267555236816406, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8818031549453735, + "num_tokens": 883391709.0, + "step": 23155 + }, + { + "epoch": 2.945681211041852, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94778060913086, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8582233190536499, + "num_tokens": 883426329.0, + "step": 23156 + }, + { + "epoch": 2.945808421320443, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.116626739501953, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.871729850769043, + "num_tokens": 883465797.0, + "step": 23157 + }, + { + "epoch": 2.945935631599033, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.916873931884766, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8768435120582581, + "num_tokens": 883501632.0, + "step": 23158 + }, + { + "epoch": 2.9460628418776236, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.193214416503906, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.884797215461731, + "num_tokens": 883537622.0, + "step": 23159 + }, + { + "epoch": 2.946190052156214, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.908472061157227, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8754149675369263, + "num_tokens": 883576458.0, + "step": 23160 + }, + { + "epoch": 2.9463172624348046, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.116281509399414, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8657392859458923, + "num_tokens": 883620929.0, + "step": 23161 + }, + { + "epoch": 2.946444472713395, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.19127082824707, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.851137638092041, + "num_tokens": 883652589.0, + "step": 23162 + }, + { + "epoch": 2.9465716829919857, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.114160537719727, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.873741626739502, + "num_tokens": 883691844.0, + "step": 23163 + }, + { + "epoch": 2.946698893270576, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0789794921875, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8544147610664368, + "num_tokens": 883732838.0, + "step": 23164 + }, + { + "epoch": 2.9468261035491667, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.082998275756836, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8719079494476318, + "num_tokens": 883771415.0, + "step": 23165 + }, + { + "epoch": 2.9469533138277573, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.154294967651367, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8595392107963562, + "num_tokens": 883808807.0, + "step": 23166 + }, + { + "epoch": 2.947080524106348, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.119245529174805, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8883328437805176, + "num_tokens": 883844300.0, + "step": 23167 + }, + { + "epoch": 2.9472077343849383, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.225263595581055, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8881673216819763, + "num_tokens": 883879337.0, + "step": 23168 + }, + { + "epoch": 2.947334944663529, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11473274230957, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8631588220596313, + "num_tokens": 883925810.0, + "step": 23169 + }, + { + "epoch": 2.9474621549421194, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.251426696777344, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8836413621902466, + "num_tokens": 883964753.0, + "step": 23170 + }, + { + "epoch": 2.94758936522071, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.154409408569336, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8671606183052063, + "num_tokens": 884005737.0, + "step": 23171 + }, + { + "epoch": 2.9477165754993004, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.036088943481445, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8743034601211548, + "num_tokens": 884043414.0, + "step": 23172 + }, + { + "epoch": 2.947843785777891, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08946418762207, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8792585134506226, + "num_tokens": 884077399.0, + "step": 23173 + }, + { + "epoch": 2.9479709960564815, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.192481994628906, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8724263906478882, + "num_tokens": 884116539.0, + "step": 23174 + }, + { + "epoch": 2.948098206335072, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.91940689086914, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8801360130310059, + "num_tokens": 884149756.0, + "step": 23175 + }, + { + "epoch": 2.9482254166136626, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.306961059570312, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8936707973480225, + "num_tokens": 884181960.0, + "step": 23176 + }, + { + "epoch": 2.9483526268922526, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05329704284668, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8699405789375305, + "num_tokens": 884223562.0, + "step": 23177 + }, + { + "epoch": 2.9484798371708436, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.873889923095703, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8799177408218384, + "num_tokens": 884264541.0, + "step": 23178 + }, + { + "epoch": 2.9486070474494337, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.223257064819336, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8605111837387085, + "num_tokens": 884301171.0, + "step": 23179 + }, + { + "epoch": 2.9487342577280247, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.041765213012695, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8724501132965088, + "num_tokens": 884332481.0, + "step": 23180 + }, + { + "epoch": 2.9488614680066147, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.330629348754883, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8945020437240601, + "num_tokens": 884371616.0, + "step": 23181 + }, + { + "epoch": 2.9489886782852053, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.22744369506836, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8638193607330322, + "num_tokens": 884411245.0, + "step": 23182 + }, + { + "epoch": 2.949115888563796, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.978830337524414, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8719589710235596, + "num_tokens": 884445712.0, + "step": 23183 + }, + { + "epoch": 2.9492430988423863, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.998802185058594, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8785483837127686, + "num_tokens": 884484935.0, + "step": 23184 + }, + { + "epoch": 2.949370309120977, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.989376068115234, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8747270107269287, + "num_tokens": 884518714.0, + "step": 23185 + }, + { + "epoch": 2.9494975193995674, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.209630966186523, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8659272193908691, + "num_tokens": 884560430.0, + "step": 23186 + }, + { + "epoch": 2.949624729678158, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.202598571777344, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8660320043563843, + "num_tokens": 884595937.0, + "step": 23187 + }, + { + "epoch": 2.9497519399567484, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.83292007446289, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8685600161552429, + "num_tokens": 884633201.0, + "step": 23188 + }, + { + "epoch": 2.949879150235339, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.2729434967041, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8756448030471802, + "num_tokens": 884672602.0, + "step": 23189 + }, + { + "epoch": 2.9500063605139295, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.948694229125977, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8784744739532471, + "num_tokens": 884708378.0, + "step": 23190 + }, + { + "epoch": 2.95013357079252, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88147735595703, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8718413710594177, + "num_tokens": 884747527.0, + "step": 23191 + }, + { + "epoch": 2.9502607810711106, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.412935256958008, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.892081618309021, + "num_tokens": 884781738.0, + "step": 23192 + }, + { + "epoch": 2.950387991349701, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.76045799255371, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8629049062728882, + "num_tokens": 884818884.0, + "step": 23193 + }, + { + "epoch": 2.9505152016282916, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98377799987793, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.884473979473114, + "num_tokens": 884851751.0, + "step": 23194 + }, + { + "epoch": 2.950642411906882, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.05849838256836, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8780938386917114, + "num_tokens": 884892935.0, + "step": 23195 + }, + { + "epoch": 2.9507696221854727, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.06951141357422, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8772458434104919, + "num_tokens": 884935820.0, + "step": 23196 + }, + { + "epoch": 2.950896832464063, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90232276916504, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8726036548614502, + "num_tokens": 884976042.0, + "step": 23197 + }, + { + "epoch": 2.9510240427426537, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.254430770874023, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.892846941947937, + "num_tokens": 885013511.0, + "step": 23198 + }, + { + "epoch": 2.9511512530212443, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.905080795288086, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8623329401016235, + "num_tokens": 885047291.0, + "step": 23199 + }, + { + "epoch": 2.951278463299835, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.909008026123047, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.870918869972229, + "num_tokens": 885083116.0, + "step": 23200 + }, + { + "epoch": 2.9514056735784253, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.054000854492188, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8699105978012085, + "num_tokens": 885123282.0, + "step": 23201 + }, + { + "epoch": 2.9515328838570154, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03107452392578, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8787475824356079, + "num_tokens": 885166277.0, + "step": 23202 + }, + { + "epoch": 2.9516600941356064, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1412296295166, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8861179351806641, + "num_tokens": 885206414.0, + "step": 23203 + }, + { + "epoch": 2.9517873044141965, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.150222778320312, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8825985193252563, + "num_tokens": 885240977.0, + "step": 23204 + }, + { + "epoch": 2.9519145146927874, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.90143394470215, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.888885498046875, + "num_tokens": 885283190.0, + "step": 23205 + }, + { + "epoch": 2.9520417249713775, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.977087020874023, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8890892863273621, + "num_tokens": 885321340.0, + "step": 23206 + }, + { + "epoch": 2.952168935249968, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.951934814453125, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8831768035888672, + "num_tokens": 885357524.0, + "step": 23207 + }, + { + "epoch": 2.9522961455285586, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.17804718017578, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8509177565574646, + "num_tokens": 885389696.0, + "step": 23208 + }, + { + "epoch": 2.952423355807149, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.073753356933594, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.889068067073822, + "num_tokens": 885429574.0, + "step": 23209 + }, + { + "epoch": 2.9525505660857396, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.17715835571289, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8785692453384399, + "num_tokens": 885459961.0, + "step": 23210 + }, + { + "epoch": 2.95267777636433, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.012939453125, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8679614067077637, + "num_tokens": 885506725.0, + "step": 23211 + }, + { + "epoch": 2.9528049866429207, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.978748321533203, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8790111541748047, + "num_tokens": 885549409.0, + "step": 23212 + }, + { + "epoch": 2.952932196921511, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.043052673339844, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8796693086624146, + "num_tokens": 885589272.0, + "step": 23213 + }, + { + "epoch": 2.9530594072001017, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.291250228881836, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.856780469417572, + "num_tokens": 885625951.0, + "step": 23214 + }, + { + "epoch": 2.9531866174786923, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.012550354003906, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8774645924568176, + "num_tokens": 885665844.0, + "step": 23215 + }, + { + "epoch": 2.953313827757283, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.16996955871582, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8742440938949585, + "num_tokens": 885705616.0, + "step": 23216 + }, + { + "epoch": 2.9534410380358733, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.88581085205078, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8868353366851807, + "num_tokens": 885752362.0, + "step": 23217 + }, + { + "epoch": 2.953568248314464, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.33051300048828, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8705788254737854, + "num_tokens": 885792485.0, + "step": 23218 + }, + { + "epoch": 2.9536954585930544, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.146038055419922, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8766204118728638, + "num_tokens": 885834809.0, + "step": 23219 + }, + { + "epoch": 2.953822668871645, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.861324310302734, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.879435658454895, + "num_tokens": 885870555.0, + "step": 23220 + }, + { + "epoch": 2.9539498791502354, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0919189453125, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8769099712371826, + "num_tokens": 885911911.0, + "step": 23221 + }, + { + "epoch": 2.954077089428826, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.059152603149414, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8864883184432983, + "num_tokens": 885941094.0, + "step": 23222 + }, + { + "epoch": 2.9542042997074165, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.210060119628906, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8743086457252502, + "num_tokens": 885978094.0, + "step": 23223 + }, + { + "epoch": 2.954331509986007, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.830923080444336, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8910868167877197, + "num_tokens": 886014859.0, + "step": 23224 + }, + { + "epoch": 2.954458720264597, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.179906845092773, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8809508085250854, + "num_tokens": 886048317.0, + "step": 23225 + }, + { + "epoch": 2.954585930543188, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.087711334228516, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8760395050048828, + "num_tokens": 886081797.0, + "step": 23226 + }, + { + "epoch": 2.954713140821778, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04603385925293, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.861458420753479, + "num_tokens": 886124111.0, + "step": 23227 + }, + { + "epoch": 2.954840351100369, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97199058532715, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8848088383674622, + "num_tokens": 886162823.0, + "step": 23228 + }, + { + "epoch": 2.954967561378959, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.013471603393555, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8770333528518677, + "num_tokens": 886204706.0, + "step": 23229 + }, + { + "epoch": 2.95509477165755, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.173585891723633, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8838684558868408, + "num_tokens": 886241461.0, + "step": 23230 + }, + { + "epoch": 2.9552219819361403, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98108673095703, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.871367335319519, + "num_tokens": 886279276.0, + "step": 23231 + }, + { + "epoch": 2.955349192214731, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.20563316345215, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8764533996582031, + "num_tokens": 886317703.0, + "step": 23232 + }, + { + "epoch": 2.9554764024933213, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96526527404785, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8809564113616943, + "num_tokens": 886355235.0, + "step": 23233 + }, + { + "epoch": 2.955603612771912, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.3569278717041, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8985384702682495, + "num_tokens": 886387354.0, + "step": 23234 + }, + { + "epoch": 2.9557308230505024, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.961116790771484, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8726763725280762, + "num_tokens": 886419742.0, + "step": 23235 + }, + { + "epoch": 2.955858033329093, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.958959579467773, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.876144528388977, + "num_tokens": 886455218.0, + "step": 23236 + }, + { + "epoch": 2.9559852436076834, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.108335494995117, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.877202033996582, + "num_tokens": 886494454.0, + "step": 23237 + }, + { + "epoch": 2.956112453886274, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.96305274963379, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8535075187683105, + "num_tokens": 886532854.0, + "step": 23238 + }, + { + "epoch": 2.9562396641648645, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.296018600463867, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8740493059158325, + "num_tokens": 886564288.0, + "step": 23239 + }, + { + "epoch": 2.956366874443455, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.13003921508789, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8776447772979736, + "num_tokens": 886597155.0, + "step": 23240 + }, + { + "epoch": 2.9564940847220456, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0286922454834, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8783637881278992, + "num_tokens": 886632015.0, + "step": 23241 + }, + { + "epoch": 2.956621295000636, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.052780151367188, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8921074867248535, + "num_tokens": 886665736.0, + "step": 23242 + }, + { + "epoch": 2.9567485052792266, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.041902542114258, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8750971555709839, + "num_tokens": 886702652.0, + "step": 23243 + }, + { + "epoch": 2.956875715557817, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.976247787475586, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8731194734573364, + "num_tokens": 886743573.0, + "step": 23244 + }, + { + "epoch": 2.9570029258364077, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.995220184326172, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.880172073841095, + "num_tokens": 886777932.0, + "step": 23245 + }, + { + "epoch": 2.957130136114998, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.979921340942383, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8635320663452148, + "num_tokens": 886813211.0, + "step": 23246 + }, + { + "epoch": 2.9572573463935887, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.084909439086914, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8801352381706238, + "num_tokens": 886849416.0, + "step": 23247 + }, + { + "epoch": 2.9573845566721793, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08363914489746, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8611698150634766, + "num_tokens": 886887983.0, + "step": 23248 + }, + { + "epoch": 2.95751176695077, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.103757858276367, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8591371774673462, + "num_tokens": 886930867.0, + "step": 23249 + }, + { + "epoch": 2.95763897722936, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.894132614135742, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8733519315719604, + "num_tokens": 886971862.0, + "step": 23250 + }, + { + "epoch": 2.957766187507951, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.963653564453125, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.881229817867279, + "num_tokens": 887005238.0, + "step": 23251 + }, + { + "epoch": 2.957893397786541, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04022789001465, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8685325384140015, + "num_tokens": 887035588.0, + "step": 23252 + }, + { + "epoch": 2.958020608065132, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.954259872436523, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8760528564453125, + "num_tokens": 887079962.0, + "step": 23253 + }, + { + "epoch": 2.958147818343722, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03021240234375, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8946211338043213, + "num_tokens": 887114572.0, + "step": 23254 + }, + { + "epoch": 2.958275028622313, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.231977462768555, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8798280954360962, + "num_tokens": 887149396.0, + "step": 23255 + }, + { + "epoch": 2.958402238900903, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.165878295898438, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8692973256111145, + "num_tokens": 887191916.0, + "step": 23256 + }, + { + "epoch": 2.9585294491794936, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.923839569091797, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8845667243003845, + "num_tokens": 887228263.0, + "step": 23257 + }, + { + "epoch": 2.958656659458084, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.182037353515625, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8908094167709351, + "num_tokens": 887258688.0, + "step": 23258 + }, + { + "epoch": 2.9587838697366746, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.316612243652344, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8719452619552612, + "num_tokens": 887301565.0, + "step": 23259 + }, + { + "epoch": 2.958911080015265, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.039854049682617, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8714339137077332, + "num_tokens": 887337511.0, + "step": 23260 + }, + { + "epoch": 2.9590382902938557, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.219026565551758, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8787798285484314, + "num_tokens": 887370694.0, + "step": 23261 + }, + { + "epoch": 2.959165500572446, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.017431259155273, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8752049803733826, + "num_tokens": 887412131.0, + "step": 23262 + }, + { + "epoch": 2.9592927108510367, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.28788185119629, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8711152076721191, + "num_tokens": 887447179.0, + "step": 23263 + }, + { + "epoch": 2.9594199211296273, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15459632873535, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8708116412162781, + "num_tokens": 887486079.0, + "step": 23264 + }, + { + "epoch": 2.959547131408218, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11101531982422, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8841303586959839, + "num_tokens": 887519092.0, + "step": 23265 + }, + { + "epoch": 2.9596743416868083, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.193004608154297, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8783671855926514, + "num_tokens": 887552008.0, + "step": 23266 + }, + { + "epoch": 2.959801551965399, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.287010192871094, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8717710971832275, + "num_tokens": 887585827.0, + "step": 23267 + }, + { + "epoch": 2.9599287622439894, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.288522720336914, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8736842274665833, + "num_tokens": 887628041.0, + "step": 23268 + }, + { + "epoch": 2.96005597252258, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.161226272583008, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8798893690109253, + "num_tokens": 887667210.0, + "step": 23269 + }, + { + "epoch": 2.9601831828011704, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.105648040771484, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8816484808921814, + "num_tokens": 887705429.0, + "step": 23270 + }, + { + "epoch": 2.960310393079761, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.20722770690918, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8579326868057251, + "num_tokens": 887753188.0, + "step": 23271 + }, + { + "epoch": 2.9604376033583515, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.002944946289062, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8596044778823853, + "num_tokens": 887795907.0, + "step": 23272 + }, + { + "epoch": 2.960564813636942, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.371248245239258, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8745412230491638, + "num_tokens": 887834859.0, + "step": 23273 + }, + { + "epoch": 2.9606920239155325, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.218032836914062, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.870409369468689, + "num_tokens": 887879041.0, + "step": 23274 + }, + { + "epoch": 2.9608192341941226, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.93383026123047, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8719537258148193, + "num_tokens": 887913752.0, + "step": 23275 + }, + { + "epoch": 2.9609464444727136, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.147069931030273, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8813152313232422, + "num_tokens": 887950946.0, + "step": 23276 + }, + { + "epoch": 2.9610736547513037, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.825185775756836, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8729357719421387, + "num_tokens": 887993155.0, + "step": 23277 + }, + { + "epoch": 2.9612008650298947, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.075252532958984, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.874631941318512, + "num_tokens": 888027831.0, + "step": 23278 + }, + { + "epoch": 2.9613280753084847, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12919044494629, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8855807185173035, + "num_tokens": 888066941.0, + "step": 23279 + }, + { + "epoch": 2.9614552855870753, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.024507522583008, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8895821571350098, + "num_tokens": 888099739.0, + "step": 23280 + }, + { + "epoch": 2.961582495865666, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.004947662353516, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8768408298492432, + "num_tokens": 888139675.0, + "step": 23281 + }, + { + "epoch": 2.9617097061442563, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.132490158081055, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8618985414505005, + "num_tokens": 888179374.0, + "step": 23282 + }, + { + "epoch": 2.961836916422847, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02604103088379, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8619335293769836, + "num_tokens": 888214139.0, + "step": 23283 + }, + { + "epoch": 2.9619641267014374, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.257822036743164, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8735086917877197, + "num_tokens": 888246365.0, + "step": 23284 + }, + { + "epoch": 2.962091336980028, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.096580505371094, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8691736459732056, + "num_tokens": 888290903.0, + "step": 23285 + }, + { + "epoch": 2.9622185472586184, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.230804443359375, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8717714548110962, + "num_tokens": 888331329.0, + "step": 23286 + }, + { + "epoch": 2.962345757537209, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.977447509765625, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8680874705314636, + "num_tokens": 888372411.0, + "step": 23287 + }, + { + "epoch": 2.9624729678157995, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.407825469970703, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8744451403617859, + "num_tokens": 888409778.0, + "step": 23288 + }, + { + "epoch": 2.96260017809439, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.859390258789062, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8813998699188232, + "num_tokens": 888453593.0, + "step": 23289 + }, + { + "epoch": 2.9627273883729806, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.519655227661133, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8879767060279846, + "num_tokens": 888488245.0, + "step": 23290 + }, + { + "epoch": 2.962854598651571, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.085187911987305, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8743424415588379, + "num_tokens": 888524306.0, + "step": 23291 + }, + { + "epoch": 2.9629818089301616, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.80359649658203, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8878814578056335, + "num_tokens": 888563791.0, + "step": 23292 + }, + { + "epoch": 2.963109019208752, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.325176239013672, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8798884153366089, + "num_tokens": 888605021.0, + "step": 23293 + }, + { + "epoch": 2.9632362294873427, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.077415466308594, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.876184344291687, + "num_tokens": 888647382.0, + "step": 23294 + }, + { + "epoch": 2.963363439765933, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.063459396362305, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8754854202270508, + "num_tokens": 888682174.0, + "step": 23295 + }, + { + "epoch": 2.9634906500445237, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.532873153686523, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8770430684089661, + "num_tokens": 888724401.0, + "step": 23296 + }, + { + "epoch": 2.9636178603231143, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.09428596496582, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8817378282546997, + "num_tokens": 888766230.0, + "step": 23297 + }, + { + "epoch": 2.963745070601705, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.267602920532227, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8536739945411682, + "num_tokens": 888811865.0, + "step": 23298 + }, + { + "epoch": 2.9638722808802953, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.231969833374023, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8755695819854736, + "num_tokens": 888849549.0, + "step": 23299 + }, + { + "epoch": 2.9639994911588854, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.069486618041992, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8775603175163269, + "num_tokens": 888881305.0, + "step": 23300 + }, + { + "epoch": 2.9641267014374764, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.30596923828125, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8804169297218323, + "num_tokens": 888917862.0, + "step": 23301 + }, + { + "epoch": 2.9642539117160664, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.98834228515625, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8757752180099487, + "num_tokens": 888953883.0, + "step": 23302 + }, + { + "epoch": 2.9643811219946574, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.307146072387695, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8926352262496948, + "num_tokens": 888991052.0, + "step": 23303 + }, + { + "epoch": 2.9645083322732475, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08319664001465, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8877016305923462, + "num_tokens": 889022523.0, + "step": 23304 + }, + { + "epoch": 2.964635542551838, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.943588256835938, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8664333820343018, + "num_tokens": 889063470.0, + "step": 23305 + }, + { + "epoch": 2.9647627528304286, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.529457092285156, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8840630054473877, + "num_tokens": 889101160.0, + "step": 23306 + }, + { + "epoch": 2.964889963109019, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.967811584472656, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8722949028015137, + "num_tokens": 889139128.0, + "step": 23307 + }, + { + "epoch": 2.9650171733876096, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.095441818237305, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8538174629211426, + "num_tokens": 889175574.0, + "step": 23308 + }, + { + "epoch": 2.9651443836662, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.013944625854492, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8728394508361816, + "num_tokens": 889216520.0, + "step": 23309 + }, + { + "epoch": 2.9652715939447907, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.847665786743164, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8871015310287476, + "num_tokens": 889256746.0, + "step": 23310 + }, + { + "epoch": 2.965398804223381, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.13941192626953, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8711934089660645, + "num_tokens": 889298868.0, + "step": 23311 + }, + { + "epoch": 2.9655260145019717, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.847448348999023, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8705004453659058, + "num_tokens": 889337758.0, + "step": 23312 + }, + { + "epoch": 2.9656532247805623, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.676851272583008, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8799667954444885, + "num_tokens": 889375308.0, + "step": 23313 + }, + { + "epoch": 2.965780435059153, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.949581146240234, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8839396834373474, + "num_tokens": 889413735.0, + "step": 23314 + }, + { + "epoch": 2.9659076453377433, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1839599609375, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8778113126754761, + "num_tokens": 889451585.0, + "step": 23315 + }, + { + "epoch": 2.966034855616334, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.615150451660156, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8594306707382202, + "num_tokens": 889487405.0, + "step": 23316 + }, + { + "epoch": 2.9661620658949244, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87395477294922, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.864454984664917, + "num_tokens": 889523051.0, + "step": 23317 + }, + { + "epoch": 2.966289276173515, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.493085861206055, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8570592999458313, + "num_tokens": 889559822.0, + "step": 23318 + }, + { + "epoch": 2.9664164864521054, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.461687088012695, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8763852119445801, + "num_tokens": 889591598.0, + "step": 23319 + }, + { + "epoch": 2.966543696730696, + "ewc_loss": 0.040283203125, + "ewc_loss_parallel": 4.029273986816406e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.637598037719727, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8853864073753357, + "num_tokens": 889629787.0, + "step": 23320 + }, + { + "epoch": 2.9666709070092865, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.720542907714844, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8815783262252808, + "num_tokens": 889668730.0, + "step": 23321 + }, + { + "epoch": 2.966798117287877, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08445930480957, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8741458654403687, + "num_tokens": 889705215.0, + "step": 23322 + }, + { + "epoch": 2.966925327566467, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.905193328857422, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8765439987182617, + "num_tokens": 889738528.0, + "step": 23323 + }, + { + "epoch": 2.967052537845058, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.446596145629883, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8787667751312256, + "num_tokens": 889771694.0, + "step": 23324 + }, + { + "epoch": 2.967179748123648, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.029216766357422, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8882855176925659, + "num_tokens": 889806096.0, + "step": 23325 + }, + { + "epoch": 2.967306958402239, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.214839935302734, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8736384510993958, + "num_tokens": 889848514.0, + "step": 23326 + }, + { + "epoch": 2.967434168680829, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.996395111083984, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8957443237304688, + "num_tokens": 889889689.0, + "step": 23327 + }, + { + "epoch": 2.96756137895942, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.189395904541016, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8782312870025635, + "num_tokens": 889925550.0, + "step": 23328 + }, + { + "epoch": 2.9676885892380103, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15431022644043, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8637113571166992, + "num_tokens": 889961797.0, + "step": 23329 + }, + { + "epoch": 2.967815799516601, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.765295028686523, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8811714053153992, + "num_tokens": 889997935.0, + "step": 23330 + }, + { + "epoch": 2.9679430097951913, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.247116088867188, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8870682716369629, + "num_tokens": 890037900.0, + "step": 23331 + }, + { + "epoch": 2.968070220073782, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.106101989746094, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8810151219367981, + "num_tokens": 890074814.0, + "step": 23332 + }, + { + "epoch": 2.9681974303523724, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.39569664001465, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8879638314247131, + "num_tokens": 890107091.0, + "step": 23333 + }, + { + "epoch": 2.968324640630963, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.26347541809082, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8841224908828735, + "num_tokens": 890140558.0, + "step": 23334 + }, + { + "epoch": 2.9684518509095534, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.021299362182617, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8760972023010254, + "num_tokens": 890174563.0, + "step": 23335 + }, + { + "epoch": 2.968579061188144, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.95956039428711, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8730721473693848, + "num_tokens": 890210584.0, + "step": 23336 + }, + { + "epoch": 2.9687062714667345, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.053831100463867, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8876779079437256, + "num_tokens": 890251955.0, + "step": 23337 + }, + { + "epoch": 2.968833481745325, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.342090606689453, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8719292879104614, + "num_tokens": 890289095.0, + "step": 23338 + }, + { + "epoch": 2.9689606920239155, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.064584732055664, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8778213858604431, + "num_tokens": 890327864.0, + "step": 23339 + }, + { + "epoch": 2.969087902302506, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.309585571289062, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8757486343383789, + "num_tokens": 890364244.0, + "step": 23340 + }, + { + "epoch": 2.9692151125810966, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.264619827270508, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8619206547737122, + "num_tokens": 890397652.0, + "step": 23341 + }, + { + "epoch": 2.969342322859687, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03273582458496, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8804171085357666, + "num_tokens": 890435598.0, + "step": 23342 + }, + { + "epoch": 2.9694695331382777, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.961153030395508, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8801791667938232, + "num_tokens": 890473445.0, + "step": 23343 + }, + { + "epoch": 2.969596743416868, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.226970672607422, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8800438642501831, + "num_tokens": 890514263.0, + "step": 23344 + }, + { + "epoch": 2.9697239536954587, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.043798446655273, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8812013864517212, + "num_tokens": 890554123.0, + "step": 23345 + }, + { + "epoch": 2.9698511639740492, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0700626373291, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8754754066467285, + "num_tokens": 890593938.0, + "step": 23346 + }, + { + "epoch": 2.9699783742526398, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08228302001953, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8652864098548889, + "num_tokens": 890632260.0, + "step": 23347 + }, + { + "epoch": 2.97010558453123, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.06305503845215, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8734382390975952, + "num_tokens": 890666852.0, + "step": 23348 + }, + { + "epoch": 2.970232794809821, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.029705047607422, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.871738076210022, + "num_tokens": 890707959.0, + "step": 23349 + }, + { + "epoch": 2.970360005088411, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.127450942993164, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8550558090209961, + "num_tokens": 890748729.0, + "step": 23350 + }, + { + "epoch": 2.970487215367002, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.288917541503906, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8885092735290527, + "num_tokens": 890782116.0, + "step": 23351 + }, + { + "epoch": 2.970614425645592, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.030588150024414, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8633953332901001, + "num_tokens": 890827309.0, + "step": 23352 + }, + { + "epoch": 2.970741635924183, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.210514068603516, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8651285767555237, + "num_tokens": 890863879.0, + "step": 23353 + }, + { + "epoch": 2.970868846202773, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.00274085998535, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8781430125236511, + "num_tokens": 890901184.0, + "step": 23354 + }, + { + "epoch": 2.9709960564813636, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.052457809448242, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.88523268699646, + "num_tokens": 890937779.0, + "step": 23355 + }, + { + "epoch": 2.971123266759954, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.181554794311523, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.88902747631073, + "num_tokens": 890978055.0, + "step": 23356 + }, + { + "epoch": 2.9712504770385446, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.076610565185547, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8469250798225403, + "num_tokens": 891010985.0, + "step": 23357 + }, + { + "epoch": 2.971377687317135, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01865577697754, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8793157339096069, + "num_tokens": 891045904.0, + "step": 23358 + }, + { + "epoch": 2.9715048975957257, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.975034713745117, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8789654970169067, + "num_tokens": 891086240.0, + "step": 23359 + }, + { + "epoch": 2.971632107874316, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.138608932495117, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.872566819190979, + "num_tokens": 891126482.0, + "step": 23360 + }, + { + "epoch": 2.9717593181529067, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.133869171142578, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.865138828754425, + "num_tokens": 891165982.0, + "step": 23361 + }, + { + "epoch": 2.9718865284314973, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08631134033203, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8623256087303162, + "num_tokens": 891199101.0, + "step": 23362 + }, + { + "epoch": 2.972013738710088, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1185302734375, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8742876052856445, + "num_tokens": 891231511.0, + "step": 23363 + }, + { + "epoch": 2.9721409489886783, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.190933227539062, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8636083602905273, + "num_tokens": 891262369.0, + "step": 23364 + }, + { + "epoch": 2.972268159267269, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0526180267334, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8599787354469299, + "num_tokens": 891304206.0, + "step": 23365 + }, + { + "epoch": 2.9723953695458594, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.120983123779297, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8634940385818481, + "num_tokens": 891348006.0, + "step": 23366 + }, + { + "epoch": 2.97252257982445, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.25925064086914, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8829151391983032, + "num_tokens": 891386851.0, + "step": 23367 + }, + { + "epoch": 2.9726497901030404, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.38802146911621, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8842368721961975, + "num_tokens": 891423582.0, + "step": 23368 + }, + { + "epoch": 2.972777000381631, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.000442504882812, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8905315399169922, + "num_tokens": 891463773.0, + "step": 23369 + }, + { + "epoch": 2.9729042106602215, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.08329963684082, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8864130973815918, + "num_tokens": 891498117.0, + "step": 23370 + }, + { + "epoch": 2.973031420938812, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.331087112426758, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8828772306442261, + "num_tokens": 891529055.0, + "step": 23371 + }, + { + "epoch": 2.9731586312174025, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.26067352294922, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8843132257461548, + "num_tokens": 891567370.0, + "step": 23372 + }, + { + "epoch": 2.9732858414959926, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.45276641845703, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8781918287277222, + "num_tokens": 891602737.0, + "step": 23373 + }, + { + "epoch": 2.9734130517745836, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.994762420654297, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8680635094642639, + "num_tokens": 891639713.0, + "step": 23374 + }, + { + "epoch": 2.9735402620531737, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.189502716064453, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8874817490577698, + "num_tokens": 891673931.0, + "step": 23375 + }, + { + "epoch": 2.9736674723317646, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.10175895690918, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.874897837638855, + "num_tokens": 891708015.0, + "step": 23376 + }, + { + "epoch": 2.9737946826103547, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.064252853393555, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8726346492767334, + "num_tokens": 891746023.0, + "step": 23377 + }, + { + "epoch": 2.9739218928889453, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.17165184020996, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.873314619064331, + "num_tokens": 891777767.0, + "step": 23378 + }, + { + "epoch": 2.974049103167536, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.14588165283203, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8783599138259888, + "num_tokens": 891810816.0, + "step": 23379 + }, + { + "epoch": 2.9741763134461263, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02326202392578, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8683416247367859, + "num_tokens": 891848810.0, + "step": 23380 + }, + { + "epoch": 2.974303523724717, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.207334518432617, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8789821863174438, + "num_tokens": 891892553.0, + "step": 23381 + }, + { + "epoch": 2.9744307340033074, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.179996490478516, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8863897919654846, + "num_tokens": 891927539.0, + "step": 23382 + }, + { + "epoch": 2.974557944281898, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.287673950195312, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8802680969238281, + "num_tokens": 891959577.0, + "step": 23383 + }, + { + "epoch": 2.9746851545604884, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.130033493041992, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8491367101669312, + "num_tokens": 892000799.0, + "step": 23384 + }, + { + "epoch": 2.974812364839079, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.158647537231445, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8691596388816833, + "num_tokens": 892042200.0, + "step": 23385 + }, + { + "epoch": 2.9749395751176695, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.212045669555664, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8667290806770325, + "num_tokens": 892084838.0, + "step": 23386 + }, + { + "epoch": 2.97506678539626, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.876314163208008, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8595627546310425, + "num_tokens": 892126442.0, + "step": 23387 + }, + { + "epoch": 2.9751939956748505, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.438404083251953, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8651500344276428, + "num_tokens": 892165019.0, + "step": 23388 + }, + { + "epoch": 2.975321205953441, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.42363739013672, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8775800466537476, + "num_tokens": 892202291.0, + "step": 23389 + }, + { + "epoch": 2.9754484162320316, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.067020416259766, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8660939931869507, + "num_tokens": 892235159.0, + "step": 23390 + }, + { + "epoch": 2.975575626510622, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.138195037841797, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8639681935310364, + "num_tokens": 892271967.0, + "step": 23391 + }, + { + "epoch": 2.9757028367892127, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.060298919677734, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8615299463272095, + "num_tokens": 892306230.0, + "step": 23392 + }, + { + "epoch": 2.975830047067803, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.015573501586914, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8777880668640137, + "num_tokens": 892342118.0, + "step": 23393 + }, + { + "epoch": 2.9759572573463937, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.2989501953125, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.879984438419342, + "num_tokens": 892380764.0, + "step": 23394 + }, + { + "epoch": 2.9760844676249842, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.196762084960938, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.87236088514328, + "num_tokens": 892421113.0, + "step": 23395 + }, + { + "epoch": 2.9762116779035748, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02459716796875, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8819606900215149, + "num_tokens": 892457512.0, + "step": 23396 + }, + { + "epoch": 2.9763388881821653, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.058090209960938, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8797187805175781, + "num_tokens": 892493599.0, + "step": 23397 + }, + { + "epoch": 2.9764660984607554, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.120153427124023, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8659284114837646, + "num_tokens": 892528474.0, + "step": 23398 + }, + { + "epoch": 2.9765933087393464, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.222110748291016, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8817228078842163, + "num_tokens": 892569591.0, + "step": 23399 + }, + { + "epoch": 2.9767205190179364, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01556396484375, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8794295787811279, + "num_tokens": 892607185.0, + "step": 23400 + }, + { + "epoch": 2.9768477292965274, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.10689353942871, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8757073879241943, + "num_tokens": 892641453.0, + "step": 23401 + }, + { + "epoch": 2.9769749395751175, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03783416748047, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8688247203826904, + "num_tokens": 892676802.0, + "step": 23402 + }, + { + "epoch": 2.977102149853708, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.090900421142578, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8861345052719116, + "num_tokens": 892711694.0, + "step": 23403 + }, + { + "epoch": 2.9772293601322986, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.061979293823242, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8657028675079346, + "num_tokens": 892752662.0, + "step": 23404 + }, + { + "epoch": 2.977356570410889, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.065492630004883, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.877220869064331, + "num_tokens": 892790170.0, + "step": 23405 + }, + { + "epoch": 2.9774837806894796, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.10047149658203, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8570214509963989, + "num_tokens": 892831437.0, + "step": 23406 + }, + { + "epoch": 2.97761099096807, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0379581451416, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8967858552932739, + "num_tokens": 892869022.0, + "step": 23407 + }, + { + "epoch": 2.9777382012466607, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.386821746826172, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8771591186523438, + "num_tokens": 892910738.0, + "step": 23408 + }, + { + "epoch": 2.977865411525251, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.083328247070312, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8837205767631531, + "num_tokens": 892952055.0, + "step": 23409 + }, + { + "epoch": 2.9779926218038417, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.012027740478516, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8674501776695251, + "num_tokens": 892985843.0, + "step": 23410 + }, + { + "epoch": 2.9781198320824323, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.218753814697266, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8653848171234131, + "num_tokens": 893031197.0, + "step": 23411 + }, + { + "epoch": 2.978247042361023, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.068899154663086, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8881649374961853, + "num_tokens": 893068113.0, + "step": 23412 + }, + { + "epoch": 2.9783742526396133, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.139387130737305, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8847978115081787, + "num_tokens": 893109148.0, + "step": 23413 + }, + { + "epoch": 2.978501462918204, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.30129051208496, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8800670504570007, + "num_tokens": 893151406.0, + "step": 23414 + }, + { + "epoch": 2.9786286731967944, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.010112762451172, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8793660402297974, + "num_tokens": 893190209.0, + "step": 23415 + }, + { + "epoch": 2.978755883475385, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.09976577758789, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.874870777130127, + "num_tokens": 893232949.0, + "step": 23416 + }, + { + "epoch": 2.9788830937539754, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.30352783203125, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8755522966384888, + "num_tokens": 893273931.0, + "step": 23417 + }, + { + "epoch": 2.979010304032566, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.106904983520508, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8560829758644104, + "num_tokens": 893317465.0, + "step": 23418 + }, + { + "epoch": 2.9791375143111565, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.09444808959961, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8607231378555298, + "num_tokens": 893354089.0, + "step": 23419 + }, + { + "epoch": 2.979264724589747, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.546159744262695, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8728581666946411, + "num_tokens": 893388051.0, + "step": 23420 + }, + { + "epoch": 2.979391934868337, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.987106323242188, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8700411319732666, + "num_tokens": 893426191.0, + "step": 23421 + }, + { + "epoch": 2.979519145146928, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.050580978393555, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.868409276008606, + "num_tokens": 893462134.0, + "step": 23422 + }, + { + "epoch": 2.979646355425518, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11161231994629, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8702250123023987, + "num_tokens": 893503159.0, + "step": 23423 + }, + { + "epoch": 2.979773565704109, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.143718719482422, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8574248552322388, + "num_tokens": 893538379.0, + "step": 23424 + }, + { + "epoch": 2.979900775982699, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.82086753845215, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8630237579345703, + "num_tokens": 893580431.0, + "step": 23425 + }, + { + "epoch": 2.98002798626129, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.213834762573242, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8678395748138428, + "num_tokens": 893622412.0, + "step": 23426 + }, + { + "epoch": 2.9801551965398803, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.33834457397461, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8785767555236816, + "num_tokens": 893658575.0, + "step": 23427 + }, + { + "epoch": 2.980282406818471, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.186477661132812, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8822051286697388, + "num_tokens": 893702812.0, + "step": 23428 + }, + { + "epoch": 2.9804096170970613, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.00576400756836, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8671044111251831, + "num_tokens": 893742785.0, + "step": 23429 + }, + { + "epoch": 2.980536827375652, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.984432220458984, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8797959089279175, + "num_tokens": 893782310.0, + "step": 23430 + }, + { + "epoch": 2.9806640376542424, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.228702545166016, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.891022801399231, + "num_tokens": 893822794.0, + "step": 23431 + }, + { + "epoch": 2.980791247932833, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.003740310668945, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8798599243164062, + "num_tokens": 893870462.0, + "step": 23432 + }, + { + "epoch": 2.9809184582114234, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.314197540283203, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8751156330108643, + "num_tokens": 893910654.0, + "step": 23433 + }, + { + "epoch": 2.981045668490014, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.116527557373047, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8717558979988098, + "num_tokens": 893948819.0, + "step": 23434 + }, + { + "epoch": 2.9811728787686045, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.632415771484375, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8817039728164673, + "num_tokens": 893983325.0, + "step": 23435 + }, + { + "epoch": 2.981300089047195, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.2093563079834, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8741647601127625, + "num_tokens": 894023712.0, + "step": 23436 + }, + { + "epoch": 2.9814272993257855, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02775764465332, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8833396434783936, + "num_tokens": 894059719.0, + "step": 23437 + }, + { + "epoch": 2.981554509604376, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04033088684082, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8755778074264526, + "num_tokens": 894100150.0, + "step": 23438 + }, + { + "epoch": 2.9816817198829666, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.038494110107422, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8547824621200562, + "num_tokens": 894134336.0, + "step": 23439 + }, + { + "epoch": 2.981808930161557, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.094547271728516, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8605924844741821, + "num_tokens": 894168972.0, + "step": 23440 + }, + { + "epoch": 2.9819361404401477, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.162200927734375, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8710195422172546, + "num_tokens": 894204641.0, + "step": 23441 + }, + { + "epoch": 2.982063350718738, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.386871337890625, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.877961277961731, + "num_tokens": 894248403.0, + "step": 23442 + }, + { + "epoch": 2.9821905609973287, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.13934898376465, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8922385573387146, + "num_tokens": 894284035.0, + "step": 23443 + }, + { + "epoch": 2.9823177712759192, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.19611167907715, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8792848587036133, + "num_tokens": 894321804.0, + "step": 23444 + }, + { + "epoch": 2.9824449815545098, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.896484375, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8827072381973267, + "num_tokens": 894355648.0, + "step": 23445 + }, + { + "epoch": 2.9825721918331, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.291685104370117, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8761242628097534, + "num_tokens": 894392947.0, + "step": 23446 + }, + { + "epoch": 2.982699402111691, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.31947135925293, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8661377429962158, + "num_tokens": 894433437.0, + "step": 23447 + }, + { + "epoch": 2.982826612390281, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.112285614013672, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8799363374710083, + "num_tokens": 894481428.0, + "step": 23448 + }, + { + "epoch": 2.982953822668872, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.243331909179688, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8739832639694214, + "num_tokens": 894518055.0, + "step": 23449 + }, + { + "epoch": 2.983081032947462, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15645980834961, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8695060014724731, + "num_tokens": 894562693.0, + "step": 23450 + }, + { + "epoch": 2.983208243226053, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.038002014160156, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8751106858253479, + "num_tokens": 894604973.0, + "step": 23451 + }, + { + "epoch": 2.983335453504643, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.166032791137695, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8807896971702576, + "num_tokens": 894645672.0, + "step": 23452 + }, + { + "epoch": 2.9834626637832335, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.183435440063477, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8731595277786255, + "num_tokens": 894686702.0, + "step": 23453 + }, + { + "epoch": 2.983589874061824, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.204376220703125, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8827317953109741, + "num_tokens": 894722768.0, + "step": 23454 + }, + { + "epoch": 2.9837170843404146, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.184219360351562, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.887474536895752, + "num_tokens": 894759216.0, + "step": 23455 + }, + { + "epoch": 2.983844294619005, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.075119018554688, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8793155550956726, + "num_tokens": 894799691.0, + "step": 23456 + }, + { + "epoch": 2.9839715048975957, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.155981063842773, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8718748092651367, + "num_tokens": 894835869.0, + "step": 23457 + }, + { + "epoch": 2.984098715176186, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.064027786254883, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8534895181655884, + "num_tokens": 894875706.0, + "step": 23458 + }, + { + "epoch": 2.9842259254547767, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.063552856445312, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8804537057876587, + "num_tokens": 894920135.0, + "step": 23459 + }, + { + "epoch": 2.9843531357333672, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.17144775390625, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8768205642700195, + "num_tokens": 894960825.0, + "step": 23460 + }, + { + "epoch": 2.9844803460119578, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.970001220703125, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.876205563545227, + "num_tokens": 895001935.0, + "step": 23461 + }, + { + "epoch": 2.9846075562905483, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.179784774780273, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8834865093231201, + "num_tokens": 895033559.0, + "step": 23462 + }, + { + "epoch": 2.984734766569139, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.238819122314453, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8745527267456055, + "num_tokens": 895070958.0, + "step": 23463 + }, + { + "epoch": 2.9848619768477294, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.816370010375977, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8736540079116821, + "num_tokens": 895111533.0, + "step": 23464 + }, + { + "epoch": 2.98498918712632, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0936336517334, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8711962699890137, + "num_tokens": 895149861.0, + "step": 23465 + }, + { + "epoch": 2.9851163974049104, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.2664794921875, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8817489743232727, + "num_tokens": 895192958.0, + "step": 23466 + }, + { + "epoch": 2.985243607683501, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.077150344848633, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8845834732055664, + "num_tokens": 895226710.0, + "step": 23467 + }, + { + "epoch": 2.9853708179620915, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.30526351928711, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8797372579574585, + "num_tokens": 895267695.0, + "step": 23468 + }, + { + "epoch": 2.985498028240682, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.099084854125977, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8909199237823486, + "num_tokens": 895303101.0, + "step": 23469 + }, + { + "epoch": 2.9856252385192725, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.978946685791016, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8809884190559387, + "num_tokens": 895340437.0, + "step": 23470 + }, + { + "epoch": 2.9857524487978626, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.211423873901367, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8887994885444641, + "num_tokens": 895377695.0, + "step": 23471 + }, + { + "epoch": 2.9858796590764536, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.969097137451172, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8663821220397949, + "num_tokens": 895418599.0, + "step": 23472 + }, + { + "epoch": 2.9860068693550437, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.21563720703125, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8740020990371704, + "num_tokens": 895454832.0, + "step": 23473 + }, + { + "epoch": 2.9861340796336346, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.903345108032227, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8570293188095093, + "num_tokens": 895490607.0, + "step": 23474 + }, + { + "epoch": 2.9862612899122247, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.03542709350586, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8684169054031372, + "num_tokens": 895531340.0, + "step": 23475 + }, + { + "epoch": 2.9863885001908153, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.175617218017578, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8761931657791138, + "num_tokens": 895568688.0, + "step": 23476 + }, + { + "epoch": 2.986515710469406, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.220787048339844, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8649212121963501, + "num_tokens": 895607316.0, + "step": 23477 + }, + { + "epoch": 2.9866429207479963, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.01279067993164, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8920021653175354, + "num_tokens": 895648694.0, + "step": 23478 + }, + { + "epoch": 2.986770131026587, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.9263973236084, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.870482861995697, + "num_tokens": 895687996.0, + "step": 23479 + }, + { + "epoch": 2.9868973413051774, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.365976333618164, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8798668384552002, + "num_tokens": 895726727.0, + "step": 23480 + }, + { + "epoch": 2.987024551583768, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11083221435547, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8791815042495728, + "num_tokens": 895760688.0, + "step": 23481 + }, + { + "epoch": 2.9871517618623584, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.145517349243164, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8544005751609802, + "num_tokens": 895797303.0, + "step": 23482 + }, + { + "epoch": 2.987278972140949, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.021831512451172, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8809654116630554, + "num_tokens": 895834078.0, + "step": 23483 + }, + { + "epoch": 2.9874061824195395, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.017587661743164, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8654186129570007, + "num_tokens": 895872640.0, + "step": 23484 + }, + { + "epoch": 2.98753339269813, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.7977352142334, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8882148265838623, + "num_tokens": 895907646.0, + "step": 23485 + }, + { + "epoch": 2.9876606029767205, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.15481185913086, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8840743899345398, + "num_tokens": 895944183.0, + "step": 23486 + }, + { + "epoch": 2.987787813255311, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.328983306884766, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8797121047973633, + "num_tokens": 895983987.0, + "step": 23487 + }, + { + "epoch": 2.9879150235339016, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.931201934814453, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8634630441665649, + "num_tokens": 896020928.0, + "step": 23488 + }, + { + "epoch": 2.988042233812492, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.912498474121094, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8818024396896362, + "num_tokens": 896057459.0, + "step": 23489 + }, + { + "epoch": 2.9881694440910826, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.139936447143555, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8713889718055725, + "num_tokens": 896095049.0, + "step": 23490 + }, + { + "epoch": 2.988296654369673, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.185279846191406, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8866666555404663, + "num_tokens": 896127433.0, + "step": 23491 + }, + { + "epoch": 2.9884238646482637, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.088573455810547, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8808403015136719, + "num_tokens": 896165662.0, + "step": 23492 + }, + { + "epoch": 2.9885510749268542, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12830924987793, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8741111159324646, + "num_tokens": 896207753.0, + "step": 23493 + }, + { + "epoch": 2.9886782852054448, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.186704635620117, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8746927380561829, + "num_tokens": 896249105.0, + "step": 23494 + }, + { + "epoch": 2.9888054954840353, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.314943313598633, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8756437301635742, + "num_tokens": 896291728.0, + "step": 23495 + }, + { + "epoch": 2.9889327057626254, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.97459602355957, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8652734756469727, + "num_tokens": 896331729.0, + "step": 23496 + }, + { + "epoch": 2.9890599160412163, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.12943458557129, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8643227815628052, + "num_tokens": 896371804.0, + "step": 23497 + }, + { + "epoch": 2.9891871263198064, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.936826705932617, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.87301105260849, + "num_tokens": 896408398.0, + "step": 23498 + }, + { + "epoch": 2.9893143365983974, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.989397048950195, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8904286026954651, + "num_tokens": 896440496.0, + "step": 23499 + }, + { + "epoch": 2.9894415468769875, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.26422691345215, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8759349584579468, + "num_tokens": 896475212.0, + "step": 23500 + }, + { + "epoch": 2.989568757155578, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.022930145263672, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8712597489356995, + "num_tokens": 896517122.0, + "step": 23501 + }, + { + "epoch": 2.9896959674341685, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.098249435424805, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8821979761123657, + "num_tokens": 896557259.0, + "step": 23502 + }, + { + "epoch": 2.989823177712759, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.136003494262695, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8661165833473206, + "num_tokens": 896600237.0, + "step": 23503 + }, + { + "epoch": 2.9899503879913496, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.160045623779297, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8870491981506348, + "num_tokens": 896640564.0, + "step": 23504 + }, + { + "epoch": 2.99007759826994, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.10982894897461, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8855298757553101, + "num_tokens": 896674164.0, + "step": 23505 + }, + { + "epoch": 2.9902048085485307, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.94283103942871, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8700968027114868, + "num_tokens": 896716613.0, + "step": 23506 + }, + { + "epoch": 2.990332018827121, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.28480339050293, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8653918504714966, + "num_tokens": 896755586.0, + "step": 23507 + }, + { + "epoch": 2.9904592291057117, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.169647216796875, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8747681379318237, + "num_tokens": 896797641.0, + "step": 23508 + }, + { + "epoch": 2.9905864393843022, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.961946487426758, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8714452981948853, + "num_tokens": 896835113.0, + "step": 23509 + }, + { + "epoch": 2.9907136496628928, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.20583152770996, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8754257559776306, + "num_tokens": 896866496.0, + "step": 23510 + }, + { + "epoch": 2.9908408599414833, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.13400650024414, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8740631341934204, + "num_tokens": 896907535.0, + "step": 23511 + }, + { + "epoch": 2.990968070220074, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11121368408203, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8924017548561096, + "num_tokens": 896948574.0, + "step": 23512 + }, + { + "epoch": 2.9910952804986644, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.310693740844727, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8694360852241516, + "num_tokens": 896992377.0, + "step": 23513 + }, + { + "epoch": 2.991222490777255, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.221670150756836, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8703693747520447, + "num_tokens": 897035411.0, + "step": 23514 + }, + { + "epoch": 2.9913497010558454, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.125755310058594, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8766885995864868, + "num_tokens": 897074661.0, + "step": 23515 + }, + { + "epoch": 2.991476911334436, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.244714736938477, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.877043604850769, + "num_tokens": 897115550.0, + "step": 23516 + }, + { + "epoch": 2.9916041216130265, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.120323181152344, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8686728477478027, + "num_tokens": 897155091.0, + "step": 23517 + }, + { + "epoch": 2.991731331891617, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.968961715698242, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8696047067642212, + "num_tokens": 897196551.0, + "step": 23518 + }, + { + "epoch": 2.991858542170207, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.184608459472656, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8852952718734741, + "num_tokens": 897231885.0, + "step": 23519 + }, + { + "epoch": 2.991985752448798, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.159456253051758, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8751463890075684, + "num_tokens": 897267944.0, + "step": 23520 + }, + { + "epoch": 2.992112962727388, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.09970474243164, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8780490159988403, + "num_tokens": 897307330.0, + "step": 23521 + }, + { + "epoch": 2.992240173005979, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.187185287475586, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8774833679199219, + "num_tokens": 897344028.0, + "step": 23522 + }, + { + "epoch": 2.992367383284569, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.194236755371094, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8668848276138306, + "num_tokens": 897391213.0, + "step": 23523 + }, + { + "epoch": 2.99249459356316, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.87839126586914, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8687548637390137, + "num_tokens": 897428222.0, + "step": 23524 + }, + { + "epoch": 2.9926218038417502, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.02507781982422, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8644761443138123, + "num_tokens": 897457737.0, + "step": 23525 + }, + { + "epoch": 2.992749014120341, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.205575942993164, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.881874680519104, + "num_tokens": 897498925.0, + "step": 23526 + }, + { + "epoch": 2.9928762243989313, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.99256134033203, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8709272146224976, + "num_tokens": 897533157.0, + "step": 23527 + }, + { + "epoch": 2.993003434677522, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.150104522705078, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8780372142791748, + "num_tokens": 897574159.0, + "step": 23528 + }, + { + "epoch": 2.9931306449561124, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.957448959350586, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.867016077041626, + "num_tokens": 897606970.0, + "step": 23529 + }, + { + "epoch": 2.993257855234703, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.302486419677734, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8528140783309937, + "num_tokens": 897647725.0, + "step": 23530 + }, + { + "epoch": 2.9933850655132934, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.0980281829834, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8783434629440308, + "num_tokens": 897687109.0, + "step": 23531 + }, + { + "epoch": 2.993512275791884, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.229084014892578, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8860520124435425, + "num_tokens": 897720994.0, + "step": 23532 + }, + { + "epoch": 2.9936394860704745, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.280786514282227, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8908945322036743, + "num_tokens": 897749855.0, + "step": 23533 + }, + { + "epoch": 2.993766696349065, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.983400344848633, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8746180534362793, + "num_tokens": 897792846.0, + "step": 23534 + }, + { + "epoch": 2.9938939066276555, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.096641540527344, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8774241805076599, + "num_tokens": 897836765.0, + "step": 23535 + }, + { + "epoch": 2.994021116906246, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.22862434387207, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8759713172912598, + "num_tokens": 897880918.0, + "step": 23536 + }, + { + "epoch": 2.9941483271848366, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.268028259277344, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8735047578811646, + "num_tokens": 897916676.0, + "step": 23537 + }, + { + "epoch": 2.994275537463427, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11103630065918, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8810731768608093, + "num_tokens": 897953968.0, + "step": 23538 + }, + { + "epoch": 2.9944027477420176, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.26936912536621, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8753554821014404, + "num_tokens": 897993055.0, + "step": 23539 + }, + { + "epoch": 2.994529958020608, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.066072463989258, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8828888535499573, + "num_tokens": 898030202.0, + "step": 23540 + }, + { + "epoch": 2.9946571682991987, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.928014755249023, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8700214624404907, + "num_tokens": 898067234.0, + "step": 23541 + }, + { + "epoch": 2.9947843785777892, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.1999568939209, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8793249726295471, + "num_tokens": 898105423.0, + "step": 23542 + }, + { + "epoch": 2.9949115888563798, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.007898330688477, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8943590521812439, + "num_tokens": 898144550.0, + "step": 23543 + }, + { + "epoch": 2.99503879913497, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.19425392150879, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8785738945007324, + "num_tokens": 898179277.0, + "step": 23544 + }, + { + "epoch": 2.995166009413561, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.949569702148438, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8732180595397949, + "num_tokens": 898218518.0, + "step": 23545 + }, + { + "epoch": 2.995293219692151, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.180423736572266, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8639445304870605, + "num_tokens": 898259266.0, + "step": 23546 + }, + { + "epoch": 2.995420429970742, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.062076568603516, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8743261694908142, + "num_tokens": 898292648.0, + "step": 23547 + }, + { + "epoch": 2.995547640249332, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.16313934326172, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8795007467269897, + "num_tokens": 898328413.0, + "step": 23548 + }, + { + "epoch": 2.995674850527923, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.061981201171875, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8751084804534912, + "num_tokens": 898376963.0, + "step": 23549 + }, + { + "epoch": 2.995802060806513, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.057525634765625, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8879661560058594, + "num_tokens": 898407256.0, + "step": 23550 + }, + { + "epoch": 2.9959292710851035, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.173791885375977, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8781164288520813, + "num_tokens": 898448800.0, + "step": 23551 + }, + { + "epoch": 2.996056481363694, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.297775268554688, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8724371194839478, + "num_tokens": 898488912.0, + "step": 23552 + }, + { + "epoch": 2.9961836916422846, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.92599105834961, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8689090013504028, + "num_tokens": 898530434.0, + "step": 23553 + }, + { + "epoch": 2.996310901920875, + "ewc_loss": 0.04052734375, + "ewc_loss_parallel": 4.0531158447265625e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.146507263183594, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8771164417266846, + "num_tokens": 898565044.0, + "step": 23554 + }, + { + "epoch": 2.9964381121994657, + "ewc_loss": 0.04150390625, + "ewc_loss_parallel": 4.1484832763671875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.28584861755371, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8857239484786987, + "num_tokens": 898601798.0, + "step": 23555 + }, + { + "epoch": 2.996565322478056, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.916845321655273, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8660926818847656, + "num_tokens": 898644789.0, + "step": 23556 + }, + { + "epoch": 2.9966925327566467, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.013080596923828, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8742806911468506, + "num_tokens": 898683928.0, + "step": 23557 + }, + { + "epoch": 2.9968197430352372, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.145877838134766, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8710535764694214, + "num_tokens": 898726217.0, + "step": 23558 + }, + { + "epoch": 2.9969469533138278, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.032630920410156, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8850010633468628, + "num_tokens": 898760575.0, + "step": 23559 + }, + { + "epoch": 2.9970741635924183, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.272640228271484, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8721458911895752, + "num_tokens": 898801232.0, + "step": 23560 + }, + { + "epoch": 2.997201373871009, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.113672256469727, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8818894028663635, + "num_tokens": 898837174.0, + "step": 23561 + }, + { + "epoch": 2.9973285841495994, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.11083221435547, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8665165901184082, + "num_tokens": 898875849.0, + "step": 23562 + }, + { + "epoch": 2.99745579442819, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.137033462524414, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8837844133377075, + "num_tokens": 898909975.0, + "step": 23563 + }, + { + "epoch": 2.9975830047067804, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.063379287719727, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8686257004737854, + "num_tokens": 898948250.0, + "step": 23564 + }, + { + "epoch": 2.997710214985371, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.14676856994629, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8915276527404785, + "num_tokens": 898989122.0, + "step": 23565 + }, + { + "epoch": 2.9978374252639615, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.28814697265625, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8704836368560791, + "num_tokens": 899028715.0, + "step": 23566 + }, + { + "epoch": 2.997964635542552, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.111906051635742, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8785238862037659, + "num_tokens": 899064101.0, + "step": 23567 + }, + { + "epoch": 2.9980918458211425, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.082345962524414, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.879115104675293, + "num_tokens": 899101759.0, + "step": 23568 + }, + { + "epoch": 2.9982190560997326, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.052637100219727, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8694788217544556, + "num_tokens": 899142273.0, + "step": 23569 + }, + { + "epoch": 2.9983462663783236, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.330095291137695, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8770138025283813, + "num_tokens": 899175878.0, + "step": 23570 + }, + { + "epoch": 2.9984734766569137, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.23391342163086, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8862608671188354, + "num_tokens": 899208937.0, + "step": 23571 + }, + { + "epoch": 2.9986006869355046, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.06831932067871, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8750734329223633, + "num_tokens": 899244088.0, + "step": 23572 + }, + { + "epoch": 2.9987278972140947, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.106842041015625, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8705544471740723, + "num_tokens": 899280111.0, + "step": 23573 + }, + { + "epoch": 2.9988551074926852, + "ewc_loss": 0.040771484375, + "ewc_loss_parallel": 4.076957702636719e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 26.976333618164062, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8747619390487671, + "num_tokens": 899310882.0, + "step": 23574 + }, + { + "epoch": 2.9989823177712758, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.048982620239258, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.86300128698349, + "num_tokens": 899346820.0, + "step": 23575 + }, + { + "epoch": 2.9991095280498663, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.17146110534668, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8886873722076416, + "num_tokens": 899382044.0, + "step": 23576 + }, + { + "epoch": 2.999236738328457, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.04380989074707, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8755610585212708, + "num_tokens": 899418389.0, + "step": 23577 + }, + { + "epoch": 2.9993639486070474, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.158151626586914, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8742382526397705, + "num_tokens": 899458486.0, + "step": 23578 + }, + { + "epoch": 2.999491158885638, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.095556259155273, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8745719194412231, + "num_tokens": 899498168.0, + "step": 23579 + }, + { + "epoch": 2.9996183691642284, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.191383361816406, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8912855386734009, + "num_tokens": 899538655.0, + "step": 23580 + }, + { + "epoch": 2.999745579442819, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.115312576293945, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8649293184280396, + "num_tokens": 899581693.0, + "step": 23581 + }, + { + "epoch": 2.9998727897214095, + "ewc_loss": 0.041015625, + "ewc_loss_parallel": 4.100799560546875e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.231672286987305, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8730556964874268, + "num_tokens": 899623817.0, + "step": 23582 + }, + { + "epoch": 3.0, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "grad_norm": 27.13128089904785, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8793513774871826, + "num_tokens": 899664226.0, + "step": 23583 + }, + { + "epoch": 3.0, + "ewc_loss": 0.041259765625, + "ewc_loss_parallel": 4.124641418457031e-05, + "ewc_loss_perp": 0.0, + "step": 23583, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.43322460087045367, + "train_runtime": 46186.2128, + "train_samples_per_second": 8.169, + "train_steps_per_second": 0.511 + } + ], + "logging_steps": 1, + "max_steps": 23583, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 11792, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.62815163329864e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..1cde2d8 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b64fe8f4a45695a3c0411df9f4487ae3d17efc46e4b36b56a02e45ee0e4aaf2 +size 13393