From 4148b6a13658bd6eda59ac2b5a08042c3879146b Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 29 Apr 2026 06:05:44 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.2-1B-Instruct_SFT_sciencev00.02 Source: Original Platform --- .gitattributes | 36 + README.md | 60 + all_results.json | 8 + chat_template.jinja | 121 + config.json | 35 + generation_config.json | 8 + model.safetensors | 3 + special_tokens_map.json | 10 + tokenizer.json | 3 + tokenizer_config.json | 2062 ++++++++++++++++ train_results.json | 8 + trainer_state.json | 5173 +++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 7530 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..5fda5fb --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +datasets: Neelectric/MoT_science_Llama3_4096toks +library_name: transformers +model_name: Llama-3.2-1B-Instruct_SFT_sciencev00.02 +tags: +- generated_from_trainer +- open-r1 +- sft +- trl +licence: license +--- + +# Model Card for Llama-3.2-1B-Instruct_SFT_sciencev00.02 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the [Neelectric/MoT_science_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/MoT_science_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.2-1B-Instruct_SFT_sciencev00.02", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_science/runs/7pp7cwki) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 1.0.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.3 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..aa63ffa --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 1.377941890090926e+18, + "train_loss": 1.2564991597543682, + "train_runtime": 1307.2842, + "train_samples": 145693, + "train_samples_per_second": 111.621, + "train_steps_per_second": 0.436 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..9912fbf --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..8ebf750 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed0417a4daba91c37a8500900717cb983b32ca8bad6f204894f82111ce57b2b5 +size 2996982344 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..aa63ffa --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 1.377941890090926e+18, + "train_loss": 1.2564991597543682, + "train_runtime": 1307.2842, + "train_samples": 145693, + "train_samples_per_second": 111.621, + "train_steps_per_second": 0.436 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..16e031e --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5173 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017543859649122807, + "grad_norm": 6.018052101135254, + "learning_rate": 0.0, + "loss": 1.789, + "mean_token_accuracy": 0.5678549408912659, + "num_tokens": 429478.0, + "step": 1 + }, + { + "epoch": 0.0035087719298245615, + "grad_norm": 6.373941898345947, + "learning_rate": 1.7543859649122807e-06, + "loss": 1.7812, + "mean_token_accuracy": 0.5701305270195007, + "num_tokens": 824562.0, + "step": 2 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 6.300591468811035, + "learning_rate": 3.5087719298245615e-06, + "loss": 1.787, + "mean_token_accuracy": 0.5685350894927979, + "num_tokens": 1228422.0, + "step": 3 + }, + { + "epoch": 0.007017543859649123, + "grad_norm": 4.870020389556885, + "learning_rate": 5.263157894736842e-06, + "loss": 1.7577, + "mean_token_accuracy": 0.570589542388916, + "num_tokens": 1633223.0, + "step": 4 + }, + { + "epoch": 0.008771929824561403, + "grad_norm": 3.564033269882202, + "learning_rate": 7.017543859649123e-06, + "loss": 1.6964, + "mean_token_accuracy": 0.5784072279930115, + "num_tokens": 2060143.0, + "step": 5 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 2.9137721061706543, + "learning_rate": 8.771929824561403e-06, + "loss": 1.646, + "mean_token_accuracy": 0.5835782289505005, + "num_tokens": 2503690.0, + "step": 6 + }, + { + "epoch": 0.012280701754385965, + "grad_norm": 2.481250047683716, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5991, + "mean_token_accuracy": 0.5927026271820068, + "num_tokens": 2904426.0, + "step": 7 + }, + { + "epoch": 0.014035087719298246, + "grad_norm": 3.379573345184326, + "learning_rate": 1.2280701754385964e-05, + "loss": 1.5736, + "mean_token_accuracy": 0.5973439812660217, + "num_tokens": 3299360.0, + "step": 8 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 2.4704713821411133, + "learning_rate": 1.4035087719298246e-05, + "loss": 1.5427, + "mean_token_accuracy": 0.6033717393875122, + "num_tokens": 3709604.0, + "step": 9 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 1.8616167306900024, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.5179, + "mean_token_accuracy": 0.606142520904541, + "num_tokens": 4122427.0, + "step": 10 + }, + { + "epoch": 0.01929824561403509, + "grad_norm": 1.90486478805542, + "learning_rate": 1.7543859649122806e-05, + "loss": 1.5291, + "mean_token_accuracy": 0.6004294157028198, + "num_tokens": 4549800.0, + "step": 11 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 1.6314690113067627, + "learning_rate": 1.929824561403509e-05, + "loss": 1.5063, + "mean_token_accuracy": 0.6060437560081482, + "num_tokens": 4972367.0, + "step": 12 + }, + { + "epoch": 0.02280701754385965, + "grad_norm": 1.4108632802963257, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4803, + "mean_token_accuracy": 0.6093297004699707, + "num_tokens": 5381784.0, + "step": 13 + }, + { + "epoch": 0.02456140350877193, + "grad_norm": 1.3794684410095215, + "learning_rate": 2.280701754385965e-05, + "loss": 1.4411, + "mean_token_accuracy": 0.6182389259338379, + "num_tokens": 5783941.0, + "step": 14 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 1.2287817001342773, + "learning_rate": 2.456140350877193e-05, + "loss": 1.423, + "mean_token_accuracy": 0.621440052986145, + "num_tokens": 6173946.0, + "step": 15 + }, + { + "epoch": 0.028070175438596492, + "grad_norm": 1.2201406955718994, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.4455, + "mean_token_accuracy": 0.6170344352722168, + "num_tokens": 6588292.0, + "step": 16 + }, + { + "epoch": 0.02982456140350877, + "grad_norm": 1.1274272203445435, + "learning_rate": 2.8070175438596492e-05, + "loss": 1.4051, + "mean_token_accuracy": 0.6251938343048096, + "num_tokens": 7005165.0, + "step": 17 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 1.040313482284546, + "learning_rate": 2.9824561403508772e-05, + "loss": 1.4229, + "mean_token_accuracy": 0.6200644373893738, + "num_tokens": 7422596.0, + "step": 18 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 1.0897908210754395, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3782, + "mean_token_accuracy": 0.6302369236946106, + "num_tokens": 7812857.0, + "step": 19 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 1.08786940574646, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.4219, + "mean_token_accuracy": 0.6207268834114075, + "num_tokens": 8244992.0, + "step": 20 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 0.991358757019043, + "learning_rate": 3.508771929824561e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.6278927326202393, + "num_tokens": 8664134.0, + "step": 21 + }, + { + "epoch": 0.03859649122807018, + "grad_norm": 1.1284328699111938, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.4004, + "mean_token_accuracy": 0.6226587295532227, + "num_tokens": 9082923.0, + "step": 22 + }, + { + "epoch": 0.04035087719298246, + "grad_norm": 1.1181979179382324, + "learning_rate": 3.859649122807018e-05, + "loss": 1.3971, + "mean_token_accuracy": 0.6253641247749329, + "num_tokens": 9497934.0, + "step": 23 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 1.2639045715332031, + "learning_rate": 4.0350877192982455e-05, + "loss": 1.4259, + "mean_token_accuracy": 0.6176168322563171, + "num_tokens": 9923716.0, + "step": 24 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 1.0910519361495972, + "learning_rate": 4.210526315789474e-05, + "loss": 1.3733, + "mean_token_accuracy": 0.6294311285018921, + "num_tokens": 10353257.0, + "step": 25 + }, + { + "epoch": 0.0456140350877193, + "grad_norm": 1.0363545417785645, + "learning_rate": 4.3859649122807014e-05, + "loss": 1.3809, + "mean_token_accuracy": 0.6275283098220825, + "num_tokens": 10767965.0, + "step": 26 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 1.048935055732727, + "learning_rate": 4.56140350877193e-05, + "loss": 1.3815, + "mean_token_accuracy": 0.6274988651275635, + "num_tokens": 11188411.0, + "step": 27 + }, + { + "epoch": 0.04912280701754386, + "grad_norm": 1.3705120086669922, + "learning_rate": 4.736842105263158e-05, + "loss": 1.357, + "mean_token_accuracy": 0.6328938007354736, + "num_tokens": 11597031.0, + "step": 28 + }, + { + "epoch": 0.05087719298245614, + "grad_norm": 1.2502895593643188, + "learning_rate": 4.912280701754386e-05, + "loss": 1.3848, + "mean_token_accuracy": 0.626395583152771, + "num_tokens": 11996650.0, + "step": 29 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.9786079525947571, + "learning_rate": 5.087719298245615e-05, + "loss": 1.3669, + "mean_token_accuracy": 0.6301195621490479, + "num_tokens": 12414351.0, + "step": 30 + }, + { + "epoch": 0.054385964912280704, + "grad_norm": 1.322411298751831, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.3703, + "mean_token_accuracy": 0.6289023756980896, + "num_tokens": 12853897.0, + "step": 31 + }, + { + "epoch": 0.056140350877192984, + "grad_norm": 1.1092149019241333, + "learning_rate": 5.438596491228071e-05, + "loss": 1.343, + "mean_token_accuracy": 0.6342843770980835, + "num_tokens": 13246819.0, + "step": 32 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 1.39752995967865, + "learning_rate": 5.6140350877192984e-05, + "loss": 1.3795, + "mean_token_accuracy": 0.6259216666221619, + "num_tokens": 13668134.0, + "step": 33 + }, + { + "epoch": 0.05964912280701754, + "grad_norm": 1.2338861227035522, + "learning_rate": 5.789473684210527e-05, + "loss": 1.3461, + "mean_token_accuracy": 0.6343478560447693, + "num_tokens": 14077677.0, + "step": 34 + }, + { + "epoch": 0.06140350877192982, + "grad_norm": 1.1885336637496948, + "learning_rate": 5.9649122807017544e-05, + "loss": 1.3637, + "mean_token_accuracy": 0.6306077241897583, + "num_tokens": 14502863.0, + "step": 35 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 1.471887469291687, + "learning_rate": 6.140350877192983e-05, + "loss": 1.3589, + "mean_token_accuracy": 0.6298720836639404, + "num_tokens": 14940331.0, + "step": 36 + }, + { + "epoch": 0.06491228070175438, + "grad_norm": 1.0302767753601074, + "learning_rate": 6.31578947368421e-05, + "loss": 1.3397, + "mean_token_accuracy": 0.6345314979553223, + "num_tokens": 15343369.0, + "step": 37 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.42778480052948, + "learning_rate": 6.49122807017544e-05, + "loss": 1.3546, + "mean_token_accuracy": 0.6308321356773376, + "num_tokens": 15752951.0, + "step": 38 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 1.52997624874115, + "learning_rate": 6.666666666666667e-05, + "loss": 1.3207, + "mean_token_accuracy": 0.6400465965270996, + "num_tokens": 16147599.0, + "step": 39 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 1.1962817907333374, + "learning_rate": 6.842105263157895e-05, + "loss": 1.3581, + "mean_token_accuracy": 0.6308744549751282, + "num_tokens": 16557268.0, + "step": 40 + }, + { + "epoch": 0.07192982456140351, + "grad_norm": 1.1255979537963867, + "learning_rate": 7.017543859649122e-05, + "loss": 1.3227, + "mean_token_accuracy": 0.639037013053894, + "num_tokens": 16950170.0, + "step": 41 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 1.3424605131149292, + "learning_rate": 7.192982456140351e-05, + "loss": 1.3326, + "mean_token_accuracy": 0.6364561319351196, + "num_tokens": 17369691.0, + "step": 42 + }, + { + "epoch": 0.07543859649122807, + "grad_norm": 1.4676284790039062, + "learning_rate": 7.368421052631579e-05, + "loss": 1.3922, + "mean_token_accuracy": 0.6229093074798584, + "num_tokens": 17777844.0, + "step": 43 + }, + { + "epoch": 0.07719298245614035, + "grad_norm": 1.339996099472046, + "learning_rate": 7.543859649122808e-05, + "loss": 1.3405, + "mean_token_accuracy": 0.6354778409004211, + "num_tokens": 18191802.0, + "step": 44 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 1.7620866298675537, + "learning_rate": 7.719298245614036e-05, + "loss": 1.3225, + "mean_token_accuracy": 0.6384661197662354, + "num_tokens": 18608551.0, + "step": 45 + }, + { + "epoch": 0.08070175438596491, + "grad_norm": 1.4890868663787842, + "learning_rate": 7.894736842105263e-05, + "loss": 1.3829, + "mean_token_accuracy": 0.6231105327606201, + "num_tokens": 19020275.0, + "step": 46 + }, + { + "epoch": 0.0824561403508772, + "grad_norm": 1.3470134735107422, + "learning_rate": 8.070175438596491e-05, + "loss": 1.3068, + "mean_token_accuracy": 0.6428367495536804, + "num_tokens": 19416792.0, + "step": 47 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 1.2967629432678223, + "learning_rate": 8.24561403508772e-05, + "loss": 1.3352, + "mean_token_accuracy": 0.6349055171012878, + "num_tokens": 19842380.0, + "step": 48 + }, + { + "epoch": 0.08596491228070176, + "grad_norm": 1.5379173755645752, + "learning_rate": 8.421052631578948e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6317988634109497, + "num_tokens": 20257340.0, + "step": 49 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 1.2810230255126953, + "learning_rate": 8.596491228070177e-05, + "loss": 1.3337, + "mean_token_accuracy": 0.6345160007476807, + "num_tokens": 20693853.0, + "step": 50 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 1.5687311887741089, + "learning_rate": 8.771929824561403e-05, + "loss": 1.3213, + "mean_token_accuracy": 0.637017011642456, + "num_tokens": 21145069.0, + "step": 51 + }, + { + "epoch": 0.0912280701754386, + "grad_norm": 1.3021150827407837, + "learning_rate": 8.947368421052632e-05, + "loss": 1.3366, + "mean_token_accuracy": 0.6351794600486755, + "num_tokens": 21548907.0, + "step": 52 + }, + { + "epoch": 0.09298245614035087, + "grad_norm": 1.6907377243041992, + "learning_rate": 9.12280701754386e-05, + "loss": 1.3326, + "mean_token_accuracy": 0.6350468397140503, + "num_tokens": 21947011.0, + "step": 53 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 1.4103087186813354, + "learning_rate": 9.298245614035089e-05, + "loss": 1.3593, + "mean_token_accuracy": 0.6285296082496643, + "num_tokens": 22384995.0, + "step": 54 + }, + { + "epoch": 0.09649122807017543, + "grad_norm": 1.3662679195404053, + "learning_rate": 9.473684210526316e-05, + "loss": 1.2921, + "mean_token_accuracy": 0.644428014755249, + "num_tokens": 22774976.0, + "step": 55 + }, + { + "epoch": 0.09824561403508772, + "grad_norm": 1.4143177270889282, + "learning_rate": 9.649122807017544e-05, + "loss": 1.3537, + "mean_token_accuracy": 0.6295624375343323, + "num_tokens": 23192796.0, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 1.085375189781189, + "learning_rate": 9.824561403508771e-05, + "loss": 1.3102, + "mean_token_accuracy": 0.6399141550064087, + "num_tokens": 23610316.0, + "step": 57 + }, + { + "epoch": 0.10175438596491228, + "grad_norm": 1.3309866189956665, + "learning_rate": 0.0001, + "loss": 1.3685, + "mean_token_accuracy": 0.6267704367637634, + "num_tokens": 24043550.0, + "step": 58 + }, + { + "epoch": 0.10350877192982456, + "grad_norm": 1.4298138618469238, + "learning_rate": 0.0001, + "loss": 1.3263, + "mean_token_accuracy": 0.6358023285865784, + "num_tokens": 24482051.0, + "step": 59 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.3495875597000122, + "learning_rate": 0.0001, + "loss": 1.3231, + "mean_token_accuracy": 0.6365145444869995, + "num_tokens": 24889567.0, + "step": 60 + }, + { + "epoch": 0.10701754385964912, + "grad_norm": 1.3433363437652588, + "learning_rate": 0.0001, + "loss": 1.3265, + "mean_token_accuracy": 0.6364420652389526, + "num_tokens": 25296636.0, + "step": 61 + }, + { + "epoch": 0.10877192982456141, + "grad_norm": 1.4023200273513794, + "learning_rate": 0.0001, + "loss": 1.2956, + "mean_token_accuracy": 0.6436514854431152, + "num_tokens": 25694614.0, + "step": 62 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 1.4170814752578735, + "learning_rate": 0.0001, + "loss": 1.308, + "mean_token_accuracy": 0.6407404541969299, + "num_tokens": 26107689.0, + "step": 63 + }, + { + "epoch": 0.11228070175438597, + "grad_norm": 1.198994755744934, + "learning_rate": 0.0001, + "loss": 1.3057, + "mean_token_accuracy": 0.6392735838890076, + "num_tokens": 26522427.0, + "step": 64 + }, + { + "epoch": 0.11403508771929824, + "grad_norm": 1.422518014907837, + "learning_rate": 0.0001, + "loss": 1.3237, + "mean_token_accuracy": 0.6369431018829346, + "num_tokens": 26934123.0, + "step": 65 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 1.3225864171981812, + "learning_rate": 0.0001, + "loss": 1.3213, + "mean_token_accuracy": 0.637446403503418, + "num_tokens": 27358915.0, + "step": 66 + }, + { + "epoch": 0.11754385964912281, + "grad_norm": 1.1103287935256958, + "learning_rate": 0.0001, + "loss": 1.3107, + "mean_token_accuracy": 0.6421770453453064, + "num_tokens": 27775556.0, + "step": 67 + }, + { + "epoch": 0.11929824561403508, + "grad_norm": 1.1607317924499512, + "learning_rate": 0.0001, + "loss": 1.3219, + "mean_token_accuracy": 0.6369372010231018, + "num_tokens": 28174439.0, + "step": 68 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 1.121587872505188, + "learning_rate": 0.0001, + "loss": 1.2874, + "mean_token_accuracy": 0.6452154517173767, + "num_tokens": 28591718.0, + "step": 69 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 1.347907304763794, + "learning_rate": 0.0001, + "loss": 1.3031, + "mean_token_accuracy": 0.6407017707824707, + "num_tokens": 29010174.0, + "step": 70 + }, + { + "epoch": 0.12456140350877193, + "grad_norm": 0.9920047521591187, + "learning_rate": 0.0001, + "loss": 1.3081, + "mean_token_accuracy": 0.6407448053359985, + "num_tokens": 29430243.0, + "step": 71 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 1.4440033435821533, + "learning_rate": 0.0001, + "loss": 1.317, + "mean_token_accuracy": 0.6371080279350281, + "num_tokens": 29852311.0, + "step": 72 + }, + { + "epoch": 0.1280701754385965, + "grad_norm": 1.172947645187378, + "learning_rate": 0.0001, + "loss": 1.2955, + "mean_token_accuracy": 0.6423896551132202, + "num_tokens": 30267287.0, + "step": 73 + }, + { + "epoch": 0.12982456140350876, + "grad_norm": 1.2112936973571777, + "learning_rate": 0.0001, + "loss": 1.3155, + "mean_token_accuracy": 0.6372196674346924, + "num_tokens": 30715583.0, + "step": 74 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 1.1959091424942017, + "learning_rate": 0.0001, + "loss": 1.326, + "mean_token_accuracy": 0.6362699866294861, + "num_tokens": 31143599.0, + "step": 75 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.3436111211776733, + "learning_rate": 0.0001, + "loss": 1.3162, + "mean_token_accuracy": 0.6371738910675049, + "num_tokens": 31573952.0, + "step": 76 + }, + { + "epoch": 0.13508771929824562, + "grad_norm": 1.101008653640747, + "learning_rate": 0.0001, + "loss": 1.334, + "mean_token_accuracy": 0.6339811086654663, + "num_tokens": 31985309.0, + "step": 77 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 1.2296723127365112, + "learning_rate": 0.0001, + "loss": 1.3346, + "mean_token_accuracy": 0.6317209005355835, + "num_tokens": 32422743.0, + "step": 78 + }, + { + "epoch": 0.13859649122807016, + "grad_norm": 1.0157369375228882, + "learning_rate": 0.0001, + "loss": 1.3197, + "mean_token_accuracy": 0.6366320848464966, + "num_tokens": 32830001.0, + "step": 79 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 1.1848087310791016, + "learning_rate": 0.0001, + "loss": 1.2867, + "mean_token_accuracy": 0.6441957950592041, + "num_tokens": 33242795.0, + "step": 80 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 1.035370945930481, + "learning_rate": 0.0001, + "loss": 1.291, + "mean_token_accuracy": 0.6427246332168579, + "num_tokens": 33631163.0, + "step": 81 + }, + { + "epoch": 0.14385964912280702, + "grad_norm": 1.2173899412155151, + "learning_rate": 0.0001, + "loss": 1.331, + "mean_token_accuracy": 0.6329866647720337, + "num_tokens": 34030802.0, + "step": 82 + }, + { + "epoch": 0.1456140350877193, + "grad_norm": 1.2178702354431152, + "learning_rate": 0.0001, + "loss": 1.3159, + "mean_token_accuracy": 0.6359639167785645, + "num_tokens": 34443782.0, + "step": 83 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 1.045278787612915, + "learning_rate": 0.0001, + "loss": 1.3248, + "mean_token_accuracy": 0.6354624629020691, + "num_tokens": 34854553.0, + "step": 84 + }, + { + "epoch": 0.14912280701754385, + "grad_norm": 1.0509958267211914, + "learning_rate": 0.0001, + "loss": 1.3038, + "mean_token_accuracy": 0.6399705410003662, + "num_tokens": 35265577.0, + "step": 85 + }, + { + "epoch": 0.15087719298245614, + "grad_norm": 1.1449450254440308, + "learning_rate": 0.0001, + "loss": 1.3004, + "mean_token_accuracy": 0.6418460011482239, + "num_tokens": 35705670.0, + "step": 86 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 1.254193902015686, + "learning_rate": 0.0001, + "loss": 1.3088, + "mean_token_accuracy": 0.6380743980407715, + "num_tokens": 36117458.0, + "step": 87 + }, + { + "epoch": 0.1543859649122807, + "grad_norm": 1.107653021812439, + "learning_rate": 0.0001, + "loss": 1.3009, + "mean_token_accuracy": 0.6418921947479248, + "num_tokens": 36528322.0, + "step": 88 + }, + { + "epoch": 0.156140350877193, + "grad_norm": 0.9854401350021362, + "learning_rate": 0.0001, + "loss": 1.3022, + "mean_token_accuracy": 0.6411072015762329, + "num_tokens": 36937089.0, + "step": 89 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.9852709174156189, + "learning_rate": 0.0001, + "loss": 1.2818, + "mean_token_accuracy": 0.6445783972740173, + "num_tokens": 37347449.0, + "step": 90 + }, + { + "epoch": 0.15964912280701754, + "grad_norm": 1.0607930421829224, + "learning_rate": 0.0001, + "loss": 1.3042, + "mean_token_accuracy": 0.6411298513412476, + "num_tokens": 37768132.0, + "step": 91 + }, + { + "epoch": 0.16140350877192983, + "grad_norm": 0.8618792295455933, + "learning_rate": 0.0001, + "loss": 1.2899, + "mean_token_accuracy": 0.6437462568283081, + "num_tokens": 38173420.0, + "step": 92 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 0.9967447519302368, + "learning_rate": 0.0001, + "loss": 1.2854, + "mean_token_accuracy": 0.6431381106376648, + "num_tokens": 38564492.0, + "step": 93 + }, + { + "epoch": 0.1649122807017544, + "grad_norm": 0.984609842300415, + "learning_rate": 0.0001, + "loss": 1.3059, + "mean_token_accuracy": 0.6391075849533081, + "num_tokens": 38973529.0, + "step": 94 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 1.2071311473846436, + "learning_rate": 0.0001, + "loss": 1.3399, + "mean_token_accuracy": 0.6329282522201538, + "num_tokens": 39394470.0, + "step": 95 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 0.976823627948761, + "learning_rate": 0.0001, + "loss": 1.3189, + "mean_token_accuracy": 0.6357463598251343, + "num_tokens": 39817247.0, + "step": 96 + }, + { + "epoch": 0.17017543859649123, + "grad_norm": 1.1396266222000122, + "learning_rate": 0.0001, + "loss": 1.3212, + "mean_token_accuracy": 0.6357501745223999, + "num_tokens": 40219578.0, + "step": 97 + }, + { + "epoch": 0.17192982456140352, + "grad_norm": 1.375174880027771, + "learning_rate": 0.0001, + "loss": 1.3187, + "mean_token_accuracy": 0.6361098289489746, + "num_tokens": 40631135.0, + "step": 98 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 1.1790404319763184, + "learning_rate": 0.0001, + "loss": 1.2962, + "mean_token_accuracy": 0.6430338621139526, + "num_tokens": 41060591.0, + "step": 99 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 1.1208826303482056, + "learning_rate": 0.0001, + "loss": 1.3243, + "mean_token_accuracy": 0.6350250244140625, + "num_tokens": 41491310.0, + "step": 100 + }, + { + "epoch": 0.17719298245614035, + "grad_norm": 0.9812876582145691, + "learning_rate": 0.0001, + "loss": 1.2989, + "mean_token_accuracy": 0.6403151154518127, + "num_tokens": 41928251.0, + "step": 101 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 1.118895173072815, + "learning_rate": 0.0001, + "loss": 1.2881, + "mean_token_accuracy": 0.643653392791748, + "num_tokens": 42328746.0, + "step": 102 + }, + { + "epoch": 0.18070175438596492, + "grad_norm": 1.0872011184692383, + "learning_rate": 0.0001, + "loss": 1.3338, + "mean_token_accuracy": 0.6348128318786621, + "num_tokens": 42770651.0, + "step": 103 + }, + { + "epoch": 0.1824561403508772, + "grad_norm": 1.0117576122283936, + "learning_rate": 0.0001, + "loss": 1.305, + "mean_token_accuracy": 0.6394751071929932, + "num_tokens": 43173759.0, + "step": 104 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 0.9142250418663025, + "learning_rate": 0.0001, + "loss": 1.2908, + "mean_token_accuracy": 0.642844557762146, + "num_tokens": 43604295.0, + "step": 105 + }, + { + "epoch": 0.18596491228070175, + "grad_norm": 1.1038587093353271, + "learning_rate": 0.0001, + "loss": 1.2963, + "mean_token_accuracy": 0.6419985294342041, + "num_tokens": 44008348.0, + "step": 106 + }, + { + "epoch": 0.18771929824561404, + "grad_norm": 0.928559422492981, + "learning_rate": 0.0001, + "loss": 1.3107, + "mean_token_accuracy": 0.6373360753059387, + "num_tokens": 44444613.0, + "step": 107 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 1.0053200721740723, + "learning_rate": 0.0001, + "loss": 1.2887, + "mean_token_accuracy": 0.6448897123336792, + "num_tokens": 44875312.0, + "step": 108 + }, + { + "epoch": 0.1912280701754386, + "grad_norm": 0.9399821758270264, + "learning_rate": 0.0001, + "loss": 1.2996, + "mean_token_accuracy": 0.6389566659927368, + "num_tokens": 45273670.0, + "step": 109 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 1.2514432668685913, + "learning_rate": 0.0001, + "loss": 1.2788, + "mean_token_accuracy": 0.6447431445121765, + "num_tokens": 45696861.0, + "step": 110 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 0.9928343892097473, + "learning_rate": 0.0001, + "loss": 1.303, + "mean_token_accuracy": 0.639816403388977, + "num_tokens": 46115387.0, + "step": 111 + }, + { + "epoch": 0.19649122807017544, + "grad_norm": 1.0918611288070679, + "learning_rate": 0.0001, + "loss": 1.2904, + "mean_token_accuracy": 0.6424538493156433, + "num_tokens": 46521933.0, + "step": 112 + }, + { + "epoch": 0.19824561403508772, + "grad_norm": 1.1192419528961182, + "learning_rate": 0.0001, + "loss": 1.3263, + "mean_token_accuracy": 0.634386420249939, + "num_tokens": 46941357.0, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.9753395318984985, + "learning_rate": 0.0001, + "loss": 1.2792, + "mean_token_accuracy": 0.6461848020553589, + "num_tokens": 47343683.0, + "step": 114 + }, + { + "epoch": 0.20175438596491227, + "grad_norm": 0.8872193694114685, + "learning_rate": 0.0001, + "loss": 1.2928, + "mean_token_accuracy": 0.6433566808700562, + "num_tokens": 47787665.0, + "step": 115 + }, + { + "epoch": 0.20350877192982456, + "grad_norm": 0.9394273161888123, + "learning_rate": 0.0001, + "loss": 1.2839, + "mean_token_accuracy": 0.643855094909668, + "num_tokens": 48190313.0, + "step": 116 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 1.136915922164917, + "learning_rate": 0.0001, + "loss": 1.2904, + "mean_token_accuracy": 0.6421671509742737, + "num_tokens": 48630247.0, + "step": 117 + }, + { + "epoch": 0.20701754385964913, + "grad_norm": 0.9522098898887634, + "learning_rate": 0.0001, + "loss": 1.2942, + "mean_token_accuracy": 0.6419311761856079, + "num_tokens": 49009657.0, + "step": 118 + }, + { + "epoch": 0.20877192982456141, + "grad_norm": 1.1538357734680176, + "learning_rate": 0.0001, + "loss": 1.2708, + "mean_token_accuracy": 0.6458480358123779, + "num_tokens": 49398873.0, + "step": 119 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.9239334464073181, + "learning_rate": 0.0001, + "loss": 1.2768, + "mean_token_accuracy": 0.6459267139434814, + "num_tokens": 49804381.0, + "step": 120 + }, + { + "epoch": 0.21228070175438596, + "grad_norm": 0.9793084859848022, + "learning_rate": 0.0001, + "loss": 1.2712, + "mean_token_accuracy": 0.6456162929534912, + "num_tokens": 50213766.0, + "step": 121 + }, + { + "epoch": 0.21403508771929824, + "grad_norm": 1.1136904954910278, + "learning_rate": 0.0001, + "loss": 1.2877, + "mean_token_accuracy": 0.6435809135437012, + "num_tokens": 50625155.0, + "step": 122 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 0.8962170481681824, + "learning_rate": 0.0001, + "loss": 1.2929, + "mean_token_accuracy": 0.6421340703964233, + "num_tokens": 51028769.0, + "step": 123 + }, + { + "epoch": 0.21754385964912282, + "grad_norm": 1.0955440998077393, + "learning_rate": 0.0001, + "loss": 1.2801, + "mean_token_accuracy": 0.6436057090759277, + "num_tokens": 51442144.0, + "step": 124 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 0.9009307622909546, + "learning_rate": 0.0001, + "loss": 1.2709, + "mean_token_accuracy": 0.6477597951889038, + "num_tokens": 51847840.0, + "step": 125 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 1.0885659456253052, + "learning_rate": 0.0001, + "loss": 1.2771, + "mean_token_accuracy": 0.6435371041297913, + "num_tokens": 52284344.0, + "step": 126 + }, + { + "epoch": 0.22280701754385965, + "grad_norm": 0.92705899477005, + "learning_rate": 0.0001, + "loss": 1.3212, + "mean_token_accuracy": 0.6340687870979309, + "num_tokens": 52685505.0, + "step": 127 + }, + { + "epoch": 0.22456140350877193, + "grad_norm": 0.9139009118080139, + "learning_rate": 0.0001, + "loss": 1.2932, + "mean_token_accuracy": 0.6407190561294556, + "num_tokens": 53110059.0, + "step": 128 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 0.8279791474342346, + "learning_rate": 0.0001, + "loss": 1.2659, + "mean_token_accuracy": 0.6474618911743164, + "num_tokens": 53502719.0, + "step": 129 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 0.9933703541755676, + "learning_rate": 0.0001, + "loss": 1.2681, + "mean_token_accuracy": 0.6483220458030701, + "num_tokens": 53917317.0, + "step": 130 + }, + { + "epoch": 0.22982456140350876, + "grad_norm": 0.887478768825531, + "learning_rate": 0.0001, + "loss": 1.3002, + "mean_token_accuracy": 0.6406154632568359, + "num_tokens": 54338305.0, + "step": 131 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 0.8612638711929321, + "learning_rate": 0.0001, + "loss": 1.2774, + "mean_token_accuracy": 0.6442515850067139, + "num_tokens": 54754376.0, + "step": 132 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.850595235824585, + "learning_rate": 0.0001, + "loss": 1.3222, + "mean_token_accuracy": 0.6351276636123657, + "num_tokens": 55177668.0, + "step": 133 + }, + { + "epoch": 0.23508771929824562, + "grad_norm": 1.1265441179275513, + "learning_rate": 0.0001, + "loss": 1.2869, + "mean_token_accuracy": 0.6440436840057373, + "num_tokens": 55588413.0, + "step": 134 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 0.8181601762771606, + "learning_rate": 0.0001, + "loss": 1.2689, + "mean_token_accuracy": 0.6486095190048218, + "num_tokens": 56002661.0, + "step": 135 + }, + { + "epoch": 0.23859649122807017, + "grad_norm": 0.9597206115722656, + "learning_rate": 0.0001, + "loss": 1.2685, + "mean_token_accuracy": 0.6476383209228516, + "num_tokens": 56399978.0, + "step": 136 + }, + { + "epoch": 0.24035087719298245, + "grad_norm": 0.9021192193031311, + "learning_rate": 0.0001, + "loss": 1.277, + "mean_token_accuracy": 0.6456701159477234, + "num_tokens": 56805057.0, + "step": 137 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 0.9269475936889648, + "learning_rate": 0.0001, + "loss": 1.2937, + "mean_token_accuracy": 0.6424517631530762, + "num_tokens": 57221548.0, + "step": 138 + }, + { + "epoch": 0.24385964912280703, + "grad_norm": 0.9395855069160461, + "learning_rate": 0.0001, + "loss": 1.2905, + "mean_token_accuracy": 0.6421390175819397, + "num_tokens": 57619205.0, + "step": 139 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 1.0334845781326294, + "learning_rate": 0.0001, + "loss": 1.2727, + "mean_token_accuracy": 0.6469994783401489, + "num_tokens": 58029104.0, + "step": 140 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 1.080823302268982, + "learning_rate": 0.0001, + "loss": 1.3086, + "mean_token_accuracy": 0.6387298107147217, + "num_tokens": 58444372.0, + "step": 141 + }, + { + "epoch": 0.24912280701754386, + "grad_norm": 0.8953016400337219, + "learning_rate": 0.0001, + "loss": 1.3045, + "mean_token_accuracy": 0.6378800272941589, + "num_tokens": 58859539.0, + "step": 142 + }, + { + "epoch": 0.25087719298245614, + "grad_norm": 0.8567958474159241, + "learning_rate": 0.0001, + "loss": 1.3121, + "mean_token_accuracy": 0.6372794508934021, + "num_tokens": 59268101.0, + "step": 143 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 1.158692479133606, + "learning_rate": 0.0001, + "loss": 1.2691, + "mean_token_accuracy": 0.6473510265350342, + "num_tokens": 59708311.0, + "step": 144 + }, + { + "epoch": 0.2543859649122807, + "grad_norm": 0.9232509732246399, + "learning_rate": 0.0001, + "loss": 1.2682, + "mean_token_accuracy": 0.648154616355896, + "num_tokens": 60135806.0, + "step": 145 + }, + { + "epoch": 0.256140350877193, + "grad_norm": 0.9411163330078125, + "learning_rate": 0.0001, + "loss": 1.3155, + "mean_token_accuracy": 0.6371098756790161, + "num_tokens": 60546162.0, + "step": 146 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 1.013136863708496, + "learning_rate": 0.0001, + "loss": 1.2655, + "mean_token_accuracy": 0.646664023399353, + "num_tokens": 60977521.0, + "step": 147 + }, + { + "epoch": 0.2596491228070175, + "grad_norm": 1.1551271677017212, + "learning_rate": 0.0001, + "loss": 1.2798, + "mean_token_accuracy": 0.6451727747917175, + "num_tokens": 61372998.0, + "step": 148 + }, + { + "epoch": 0.2614035087719298, + "grad_norm": 0.8795229196548462, + "learning_rate": 0.0001, + "loss": 1.2982, + "mean_token_accuracy": 0.6401211023330688, + "num_tokens": 61781320.0, + "step": 149 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.965307891368866, + "learning_rate": 0.0001, + "loss": 1.2788, + "mean_token_accuracy": 0.6439850330352783, + "num_tokens": 62199535.0, + "step": 150 + }, + { + "epoch": 0.2649122807017544, + "grad_norm": 0.9804089665412903, + "learning_rate": 0.0001, + "loss": 1.2765, + "mean_token_accuracy": 0.6456645727157593, + "num_tokens": 62632976.0, + "step": 151 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.9098561406135559, + "learning_rate": 0.0001, + "loss": 1.2564, + "mean_token_accuracy": 0.6501985192298889, + "num_tokens": 63043041.0, + "step": 152 + }, + { + "epoch": 0.26842105263157895, + "grad_norm": 0.7934507727622986, + "learning_rate": 0.0001, + "loss": 1.2638, + "mean_token_accuracy": 0.6469926834106445, + "num_tokens": 63453330.0, + "step": 153 + }, + { + "epoch": 0.27017543859649124, + "grad_norm": 1.0823460817337036, + "learning_rate": 0.0001, + "loss": 1.2983, + "mean_token_accuracy": 0.6407555341720581, + "num_tokens": 63864814.0, + "step": 154 + }, + { + "epoch": 0.2719298245614035, + "grad_norm": 0.7126585841178894, + "learning_rate": 0.0001, + "loss": 1.2528, + "mean_token_accuracy": 0.6504640579223633, + "num_tokens": 64280965.0, + "step": 155 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 0.9585691690444946, + "learning_rate": 0.0001, + "loss": 1.2686, + "mean_token_accuracy": 0.6472254395484924, + "num_tokens": 64672526.0, + "step": 156 + }, + { + "epoch": 0.2754385964912281, + "grad_norm": 0.752656102180481, + "learning_rate": 0.0001, + "loss": 1.2474, + "mean_token_accuracy": 0.6523414850234985, + "num_tokens": 65077313.0, + "step": 157 + }, + { + "epoch": 0.2771929824561403, + "grad_norm": 0.9288751482963562, + "learning_rate": 0.0001, + "loss": 1.2624, + "mean_token_accuracy": 0.646138072013855, + "num_tokens": 65487692.0, + "step": 158 + }, + { + "epoch": 0.2789473684210526, + "grad_norm": 0.8809065222740173, + "learning_rate": 0.0001, + "loss": 1.2607, + "mean_token_accuracy": 0.6489483118057251, + "num_tokens": 65905410.0, + "step": 159 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 0.9240980744361877, + "learning_rate": 0.0001, + "loss": 1.2958, + "mean_token_accuracy": 0.6388487219810486, + "num_tokens": 66310475.0, + "step": 160 + }, + { + "epoch": 0.2824561403508772, + "grad_norm": 0.8388931751251221, + "learning_rate": 0.0001, + "loss": 1.2838, + "mean_token_accuracy": 0.6431634426116943, + "num_tokens": 66727367.0, + "step": 161 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 0.8820334076881409, + "learning_rate": 0.0001, + "loss": 1.303, + "mean_token_accuracy": 0.63872230052948, + "num_tokens": 67154750.0, + "step": 162 + }, + { + "epoch": 0.28596491228070176, + "grad_norm": 0.8385342359542847, + "learning_rate": 0.0001, + "loss": 1.2632, + "mean_token_accuracy": 0.6464277505874634, + "num_tokens": 67571403.0, + "step": 163 + }, + { + "epoch": 0.28771929824561404, + "grad_norm": 0.8737322092056274, + "learning_rate": 0.0001, + "loss": 1.248, + "mean_token_accuracy": 0.6486573219299316, + "num_tokens": 67947644.0, + "step": 164 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 0.8021559119224548, + "learning_rate": 0.0001, + "loss": 1.2716, + "mean_token_accuracy": 0.644127607345581, + "num_tokens": 68362596.0, + "step": 165 + }, + { + "epoch": 0.2912280701754386, + "grad_norm": 0.7599727511405945, + "learning_rate": 0.0001, + "loss": 1.2511, + "mean_token_accuracy": 0.6510897874832153, + "num_tokens": 68768455.0, + "step": 166 + }, + { + "epoch": 0.2929824561403509, + "grad_norm": 0.8421241044998169, + "learning_rate": 0.0001, + "loss": 1.2839, + "mean_token_accuracy": 0.6423199772834778, + "num_tokens": 69182971.0, + "step": 167 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 0.8853815793991089, + "learning_rate": 0.0001, + "loss": 1.2698, + "mean_token_accuracy": 0.6462802886962891, + "num_tokens": 69633484.0, + "step": 168 + }, + { + "epoch": 0.2964912280701754, + "grad_norm": 1.0347827672958374, + "learning_rate": 0.0001, + "loss": 1.2935, + "mean_token_accuracy": 0.6394780874252319, + "num_tokens": 70069352.0, + "step": 169 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 0.8993912935256958, + "learning_rate": 0.0001, + "loss": 1.2689, + "mean_token_accuracy": 0.6471203565597534, + "num_tokens": 70478616.0, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.8886722922325134, + "learning_rate": 0.0001, + "loss": 1.2943, + "mean_token_accuracy": 0.6398030519485474, + "num_tokens": 70905952.0, + "step": 171 + }, + { + "epoch": 0.3017543859649123, + "grad_norm": 1.1024540662765503, + "learning_rate": 0.0001, + "loss": 1.2799, + "mean_token_accuracy": 0.6444812417030334, + "num_tokens": 71317263.0, + "step": 172 + }, + { + "epoch": 0.30350877192982456, + "grad_norm": 0.8578200340270996, + "learning_rate": 0.0001, + "loss": 1.2851, + "mean_token_accuracy": 0.6433255672454834, + "num_tokens": 71764261.0, + "step": 173 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 0.9540239572525024, + "learning_rate": 0.0001, + "loss": 1.2777, + "mean_token_accuracy": 0.644009530544281, + "num_tokens": 72179518.0, + "step": 174 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 0.9197105169296265, + "learning_rate": 0.0001, + "loss": 1.2664, + "mean_token_accuracy": 0.6461474895477295, + "num_tokens": 72594932.0, + "step": 175 + }, + { + "epoch": 0.3087719298245614, + "grad_norm": 0.7414684891700745, + "learning_rate": 0.0001, + "loss": 1.2688, + "mean_token_accuracy": 0.6475616097450256, + "num_tokens": 73005473.0, + "step": 176 + }, + { + "epoch": 0.3105263157894737, + "grad_norm": 0.9558865427970886, + "learning_rate": 0.0001, + "loss": 1.2673, + "mean_token_accuracy": 0.6460444927215576, + "num_tokens": 73397415.0, + "step": 177 + }, + { + "epoch": 0.312280701754386, + "grad_norm": 0.9229446053504944, + "learning_rate": 0.0001, + "loss": 1.2738, + "mean_token_accuracy": 0.6450560092926025, + "num_tokens": 73793964.0, + "step": 178 + }, + { + "epoch": 0.3140350877192982, + "grad_norm": 0.7821291089057922, + "learning_rate": 0.0001, + "loss": 1.2489, + "mean_token_accuracy": 0.6500634551048279, + "num_tokens": 74195270.0, + "step": 179 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.7419561743736267, + "learning_rate": 0.0001, + "loss": 1.2612, + "mean_token_accuracy": 0.6482713222503662, + "num_tokens": 74603802.0, + "step": 180 + }, + { + "epoch": 0.3175438596491228, + "grad_norm": 0.7956511378288269, + "learning_rate": 0.0001, + "loss": 1.2644, + "mean_token_accuracy": 0.6461498141288757, + "num_tokens": 75026083.0, + "step": 181 + }, + { + "epoch": 0.3192982456140351, + "grad_norm": 0.7759901285171509, + "learning_rate": 0.0001, + "loss": 1.2647, + "mean_token_accuracy": 0.6459044218063354, + "num_tokens": 75426052.0, + "step": 182 + }, + { + "epoch": 0.32105263157894737, + "grad_norm": 0.8206394910812378, + "learning_rate": 0.0001, + "loss": 1.2661, + "mean_token_accuracy": 0.646410346031189, + "num_tokens": 75830225.0, + "step": 183 + }, + { + "epoch": 0.32280701754385965, + "grad_norm": 0.9032196402549744, + "learning_rate": 0.0001, + "loss": 1.2677, + "mean_token_accuracy": 0.64692223072052, + "num_tokens": 76241940.0, + "step": 184 + }, + { + "epoch": 0.32456140350877194, + "grad_norm": 0.7018728256225586, + "learning_rate": 0.0001, + "loss": 1.2505, + "mean_token_accuracy": 0.6487954258918762, + "num_tokens": 76663142.0, + "step": 185 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 0.9026210904121399, + "learning_rate": 0.0001, + "loss": 1.2483, + "mean_token_accuracy": 0.652391254901886, + "num_tokens": 77102129.0, + "step": 186 + }, + { + "epoch": 0.3280701754385965, + "grad_norm": 0.8878228068351746, + "learning_rate": 0.0001, + "loss": 1.2736, + "mean_token_accuracy": 0.6432400941848755, + "num_tokens": 77508487.0, + "step": 187 + }, + { + "epoch": 0.3298245614035088, + "grad_norm": 0.9250103235244751, + "learning_rate": 0.0001, + "loss": 1.2426, + "mean_token_accuracy": 0.651581883430481, + "num_tokens": 77908351.0, + "step": 188 + }, + { + "epoch": 0.33157894736842103, + "grad_norm": 0.6793785095214844, + "learning_rate": 0.0001, + "loss": 1.2453, + "mean_token_accuracy": 0.6514815092086792, + "num_tokens": 78320845.0, + "step": 189 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7402032017707825, + "learning_rate": 0.0001, + "loss": 1.2523, + "mean_token_accuracy": 0.6495726704597473, + "num_tokens": 78732383.0, + "step": 190 + }, + { + "epoch": 0.3350877192982456, + "grad_norm": 0.9974339604377747, + "learning_rate": 0.0001, + "loss": 1.2545, + "mean_token_accuracy": 0.6485158801078796, + "num_tokens": 79123256.0, + "step": 191 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 0.9054931998252869, + "learning_rate": 0.0001, + "loss": 1.2538, + "mean_token_accuracy": 0.6507511734962463, + "num_tokens": 79525484.0, + "step": 192 + }, + { + "epoch": 0.3385964912280702, + "grad_norm": 0.7434863448143005, + "learning_rate": 0.0001, + "loss": 1.2615, + "mean_token_accuracy": 0.6481617093086243, + "num_tokens": 79955140.0, + "step": 193 + }, + { + "epoch": 0.34035087719298246, + "grad_norm": 0.7779750823974609, + "learning_rate": 0.0001, + "loss": 1.2739, + "mean_token_accuracy": 0.6448099613189697, + "num_tokens": 80387845.0, + "step": 194 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 0.8742808103561401, + "learning_rate": 0.0001, + "loss": 1.2674, + "mean_token_accuracy": 0.6466174125671387, + "num_tokens": 80810849.0, + "step": 195 + }, + { + "epoch": 0.34385964912280703, + "grad_norm": 0.810045063495636, + "learning_rate": 0.0001, + "loss": 1.2722, + "mean_token_accuracy": 0.6458349227905273, + "num_tokens": 81226626.0, + "step": 196 + }, + { + "epoch": 0.3456140350877193, + "grad_norm": 0.7127732634544373, + "learning_rate": 0.0001, + "loss": 1.2912, + "mean_token_accuracy": 0.6405543684959412, + "num_tokens": 81648045.0, + "step": 197 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 0.8309784531593323, + "learning_rate": 0.0001, + "loss": 1.254, + "mean_token_accuracy": 0.6493059396743774, + "num_tokens": 82055621.0, + "step": 198 + }, + { + "epoch": 0.34912280701754383, + "grad_norm": 0.8503166437149048, + "learning_rate": 0.0001, + "loss": 1.2772, + "mean_token_accuracy": 0.6441330909729004, + "num_tokens": 82454820.0, + "step": 199 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 0.8834285736083984, + "learning_rate": 0.0001, + "loss": 1.2701, + "mean_token_accuracy": 0.6456678509712219, + "num_tokens": 82881912.0, + "step": 200 + }, + { + "epoch": 0.3526315789473684, + "grad_norm": 0.7746639847755432, + "learning_rate": 0.0001, + "loss": 1.2731, + "mean_token_accuracy": 0.6454135775566101, + "num_tokens": 83294708.0, + "step": 201 + }, + { + "epoch": 0.3543859649122807, + "grad_norm": 0.8626236915588379, + "learning_rate": 0.0001, + "loss": 1.2677, + "mean_token_accuracy": 0.6472684144973755, + "num_tokens": 83692153.0, + "step": 202 + }, + { + "epoch": 0.356140350877193, + "grad_norm": 0.8129353523254395, + "learning_rate": 0.0001, + "loss": 1.2504, + "mean_token_accuracy": 0.649857223033905, + "num_tokens": 84106215.0, + "step": 203 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 0.9501094818115234, + "learning_rate": 0.0001, + "loss": 1.2788, + "mean_token_accuracy": 0.6440906524658203, + "num_tokens": 84533326.0, + "step": 204 + }, + { + "epoch": 0.35964912280701755, + "grad_norm": 0.7424087524414062, + "learning_rate": 0.0001, + "loss": 1.2663, + "mean_token_accuracy": 0.6469358205795288, + "num_tokens": 84958198.0, + "step": 205 + }, + { + "epoch": 0.36140350877192984, + "grad_norm": 0.7956259846687317, + "learning_rate": 0.0001, + "loss": 1.2691, + "mean_token_accuracy": 0.6459769010543823, + "num_tokens": 85375870.0, + "step": 206 + }, + { + "epoch": 0.3631578947368421, + "grad_norm": 0.7288737893104553, + "learning_rate": 0.0001, + "loss": 1.2707, + "mean_token_accuracy": 0.6465530395507812, + "num_tokens": 85800348.0, + "step": 207 + }, + { + "epoch": 0.3649122807017544, + "grad_norm": 0.7138190865516663, + "learning_rate": 0.0001, + "loss": 1.2415, + "mean_token_accuracy": 0.6529696583747864, + "num_tokens": 86195245.0, + "step": 208 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.9041345119476318, + "learning_rate": 0.0001, + "loss": 1.2673, + "mean_token_accuracy": 0.6487336754798889, + "num_tokens": 86599515.0, + "step": 209 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.7553381323814392, + "learning_rate": 0.0001, + "loss": 1.298, + "mean_token_accuracy": 0.6391161680221558, + "num_tokens": 87039537.0, + "step": 210 + }, + { + "epoch": 0.3701754385964912, + "grad_norm": 0.7526540160179138, + "learning_rate": 0.0001, + "loss": 1.2465, + "mean_token_accuracy": 0.6537913084030151, + "num_tokens": 87426029.0, + "step": 211 + }, + { + "epoch": 0.3719298245614035, + "grad_norm": 0.9352124333381653, + "learning_rate": 0.0001, + "loss": 1.2655, + "mean_token_accuracy": 0.6462576389312744, + "num_tokens": 87817987.0, + "step": 212 + }, + { + "epoch": 0.3736842105263158, + "grad_norm": 0.8342838883399963, + "learning_rate": 0.0001, + "loss": 1.2356, + "mean_token_accuracy": 0.653677225112915, + "num_tokens": 88218779.0, + "step": 213 + }, + { + "epoch": 0.37543859649122807, + "grad_norm": 0.7606971263885498, + "learning_rate": 0.0001, + "loss": 1.2423, + "mean_token_accuracy": 0.6509567499160767, + "num_tokens": 88610670.0, + "step": 214 + }, + { + "epoch": 0.37719298245614036, + "grad_norm": 0.9147993326187134, + "learning_rate": 0.0001, + "loss": 1.2777, + "mean_token_accuracy": 0.6447807550430298, + "num_tokens": 89031094.0, + "step": 215 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 0.8798630833625793, + "learning_rate": 0.0001, + "loss": 1.282, + "mean_token_accuracy": 0.6422438025474548, + "num_tokens": 89465235.0, + "step": 216 + }, + { + "epoch": 0.38070175438596493, + "grad_norm": 0.7571805119514465, + "learning_rate": 0.0001, + "loss": 1.2503, + "mean_token_accuracy": 0.6498503684997559, + "num_tokens": 89867145.0, + "step": 217 + }, + { + "epoch": 0.3824561403508772, + "grad_norm": 0.9793193936347961, + "learning_rate": 0.0001, + "loss": 1.248, + "mean_token_accuracy": 0.6518094539642334, + "num_tokens": 90262494.0, + "step": 218 + }, + { + "epoch": 0.38421052631578945, + "grad_norm": 0.871235728263855, + "learning_rate": 0.0001, + "loss": 1.2707, + "mean_token_accuracy": 0.6453557014465332, + "num_tokens": 90671131.0, + "step": 219 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 0.7807226181030273, + "learning_rate": 0.0001, + "loss": 1.2607, + "mean_token_accuracy": 0.6465795040130615, + "num_tokens": 91092593.0, + "step": 220 + }, + { + "epoch": 0.387719298245614, + "grad_norm": 0.9600160121917725, + "learning_rate": 0.0001, + "loss": 1.2791, + "mean_token_accuracy": 0.6422662734985352, + "num_tokens": 91501248.0, + "step": 221 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 0.8549517393112183, + "learning_rate": 0.0001, + "loss": 1.243, + "mean_token_accuracy": 0.6511504650115967, + "num_tokens": 91913007.0, + "step": 222 + }, + { + "epoch": 0.3912280701754386, + "grad_norm": 0.7951960563659668, + "learning_rate": 0.0001, + "loss": 1.2377, + "mean_token_accuracy": 0.6538807153701782, + "num_tokens": 92321188.0, + "step": 223 + }, + { + "epoch": 0.3929824561403509, + "grad_norm": 0.8606045842170715, + "learning_rate": 0.0001, + "loss": 1.2678, + "mean_token_accuracy": 0.645931601524353, + "num_tokens": 92726934.0, + "step": 224 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.7008436322212219, + "learning_rate": 0.0001, + "loss": 1.2384, + "mean_token_accuracy": 0.6525442600250244, + "num_tokens": 93128966.0, + "step": 225 + }, + { + "epoch": 0.39649122807017545, + "grad_norm": 0.7526488304138184, + "learning_rate": 0.0001, + "loss": 1.2738, + "mean_token_accuracy": 0.6442047357559204, + "num_tokens": 93567917.0, + "step": 226 + }, + { + "epoch": 0.39824561403508774, + "grad_norm": 0.8679794669151306, + "learning_rate": 0.0001, + "loss": 1.2548, + "mean_token_accuracy": 0.6482324600219727, + "num_tokens": 93979268.0, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.8233749270439148, + "learning_rate": 0.0001, + "loss": 1.2215, + "mean_token_accuracy": 0.6573722958564758, + "num_tokens": 94367195.0, + "step": 228 + }, + { + "epoch": 0.4017543859649123, + "grad_norm": 0.7261408567428589, + "learning_rate": 0.0001, + "loss": 1.2415, + "mean_token_accuracy": 0.6515704989433289, + "num_tokens": 94759639.0, + "step": 229 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 0.7959755659103394, + "learning_rate": 0.0001, + "loss": 1.2722, + "mean_token_accuracy": 0.6438157558441162, + "num_tokens": 95191668.0, + "step": 230 + }, + { + "epoch": 0.4052631578947368, + "grad_norm": 0.8794543147087097, + "learning_rate": 0.0001, + "loss": 1.2477, + "mean_token_accuracy": 0.6511826515197754, + "num_tokens": 95614874.0, + "step": 231 + }, + { + "epoch": 0.4070175438596491, + "grad_norm": 0.7663288116455078, + "learning_rate": 0.0001, + "loss": 1.2175, + "mean_token_accuracy": 0.656987190246582, + "num_tokens": 95990695.0, + "step": 232 + }, + { + "epoch": 0.4087719298245614, + "grad_norm": 0.7509688138961792, + "learning_rate": 0.0001, + "loss": 1.2395, + "mean_token_accuracy": 0.6518152952194214, + "num_tokens": 96377947.0, + "step": 233 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 0.9182112812995911, + "learning_rate": 0.0001, + "loss": 1.2567, + "mean_token_accuracy": 0.6489981412887573, + "num_tokens": 96787894.0, + "step": 234 + }, + { + "epoch": 0.41228070175438597, + "grad_norm": 0.8123442530632019, + "learning_rate": 0.0001, + "loss": 1.2541, + "mean_token_accuracy": 0.6488667726516724, + "num_tokens": 97203762.0, + "step": 235 + }, + { + "epoch": 0.41403508771929826, + "grad_norm": 0.8581697344779968, + "learning_rate": 0.0001, + "loss": 1.2595, + "mean_token_accuracy": 0.6481375694274902, + "num_tokens": 97598494.0, + "step": 236 + }, + { + "epoch": 0.41578947368421054, + "grad_norm": 0.8051207065582275, + "learning_rate": 0.0001, + "loss": 1.2763, + "mean_token_accuracy": 0.644673228263855, + "num_tokens": 98011465.0, + "step": 237 + }, + { + "epoch": 0.41754385964912283, + "grad_norm": 0.7852127552032471, + "learning_rate": 0.0001, + "loss": 1.2765, + "mean_token_accuracy": 0.6438398361206055, + "num_tokens": 98447067.0, + "step": 238 + }, + { + "epoch": 0.4192982456140351, + "grad_norm": 0.7962046265602112, + "learning_rate": 0.0001, + "loss": 1.2429, + "mean_token_accuracy": 0.6508150100708008, + "num_tokens": 98854782.0, + "step": 239 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.8521065711975098, + "learning_rate": 0.0001, + "loss": 1.2499, + "mean_token_accuracy": 0.64998859167099, + "num_tokens": 99276423.0, + "step": 240 + }, + { + "epoch": 0.42280701754385963, + "grad_norm": 0.8006791472434998, + "learning_rate": 0.0001, + "loss": 1.2523, + "mean_token_accuracy": 0.65036940574646, + "num_tokens": 99705527.0, + "step": 241 + }, + { + "epoch": 0.4245614035087719, + "grad_norm": 0.6923927664756775, + "learning_rate": 0.0001, + "loss": 1.2698, + "mean_token_accuracy": 0.645989179611206, + "num_tokens": 100144851.0, + "step": 242 + }, + { + "epoch": 0.4263157894736842, + "grad_norm": 0.8310588002204895, + "learning_rate": 0.0001, + "loss": 1.2131, + "mean_token_accuracy": 0.6578903198242188, + "num_tokens": 100546756.0, + "step": 243 + }, + { + "epoch": 0.4280701754385965, + "grad_norm": 0.7767439484596252, + "learning_rate": 0.0001, + "loss": 1.2374, + "mean_token_accuracy": 0.6534087061882019, + "num_tokens": 100961732.0, + "step": 244 + }, + { + "epoch": 0.4298245614035088, + "grad_norm": 0.7211782932281494, + "learning_rate": 0.0001, + "loss": 1.2627, + "mean_token_accuracy": 0.6475375294685364, + "num_tokens": 101396866.0, + "step": 245 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 0.754098117351532, + "learning_rate": 0.0001, + "loss": 1.2535, + "mean_token_accuracy": 0.6487017869949341, + "num_tokens": 101823395.0, + "step": 246 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.887698233127594, + "learning_rate": 0.0001, + "loss": 1.2372, + "mean_token_accuracy": 0.6514610052108765, + "num_tokens": 102218896.0, + "step": 247 + }, + { + "epoch": 0.43508771929824563, + "grad_norm": 0.6688896417617798, + "learning_rate": 0.0001, + "loss": 1.2135, + "mean_token_accuracy": 0.6570154428482056, + "num_tokens": 102635943.0, + "step": 248 + }, + { + "epoch": 0.4368421052631579, + "grad_norm": 0.6720183491706848, + "learning_rate": 0.0001, + "loss": 1.2314, + "mean_token_accuracy": 0.653835654258728, + "num_tokens": 103060426.0, + "step": 249 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 0.6985954642295837, + "learning_rate": 0.0001, + "loss": 1.2302, + "mean_token_accuracy": 0.6543055176734924, + "num_tokens": 103480891.0, + "step": 250 + }, + { + "epoch": 0.44035087719298244, + "grad_norm": 0.7861040234565735, + "learning_rate": 0.0001, + "loss": 1.2259, + "mean_token_accuracy": 0.6543919444084167, + "num_tokens": 103896368.0, + "step": 251 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 0.7467155456542969, + "learning_rate": 0.0001, + "loss": 1.2425, + "mean_token_accuracy": 0.6521680355072021, + "num_tokens": 104318424.0, + "step": 252 + }, + { + "epoch": 0.443859649122807, + "grad_norm": 0.689565896987915, + "learning_rate": 0.0001, + "loss": 1.2384, + "mean_token_accuracy": 0.6529023051261902, + "num_tokens": 104743917.0, + "step": 253 + }, + { + "epoch": 0.4456140350877193, + "grad_norm": 0.8311668634414673, + "learning_rate": 0.0001, + "loss": 1.2286, + "mean_token_accuracy": 0.6543080806732178, + "num_tokens": 105146836.0, + "step": 254 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 0.8047776818275452, + "learning_rate": 0.0001, + "loss": 1.2555, + "mean_token_accuracy": 0.6480646133422852, + "num_tokens": 105552302.0, + "step": 255 + }, + { + "epoch": 0.44912280701754387, + "grad_norm": 0.6903892159461975, + "learning_rate": 0.0001, + "loss": 1.2878, + "mean_token_accuracy": 0.6401119232177734, + "num_tokens": 105963414.0, + "step": 256 + }, + { + "epoch": 0.45087719298245615, + "grad_norm": 0.7000618577003479, + "learning_rate": 0.0001, + "loss": 1.2557, + "mean_token_accuracy": 0.6474447846412659, + "num_tokens": 106368736.0, + "step": 257 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 0.7351795434951782, + "learning_rate": 0.0001, + "loss": 1.2237, + "mean_token_accuracy": 0.6556580662727356, + "num_tokens": 106769771.0, + "step": 258 + }, + { + "epoch": 0.4543859649122807, + "grad_norm": 0.7257981300354004, + "learning_rate": 0.0001, + "loss": 1.2416, + "mean_token_accuracy": 0.6521273255348206, + "num_tokens": 107170029.0, + "step": 259 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 0.8522328734397888, + "learning_rate": 0.0001, + "loss": 1.2301, + "mean_token_accuracy": 0.6529244184494019, + "num_tokens": 107576140.0, + "step": 260 + }, + { + "epoch": 0.45789473684210524, + "grad_norm": 0.6672490835189819, + "learning_rate": 0.0001, + "loss": 1.2414, + "mean_token_accuracy": 0.6505205631256104, + "num_tokens": 108009067.0, + "step": 261 + }, + { + "epoch": 0.45964912280701753, + "grad_norm": 0.8998327255249023, + "learning_rate": 0.0001, + "loss": 1.2365, + "mean_token_accuracy": 0.6523317694664001, + "num_tokens": 108434633.0, + "step": 262 + }, + { + "epoch": 0.4614035087719298, + "grad_norm": 0.7883278727531433, + "learning_rate": 0.0001, + "loss": 1.2511, + "mean_token_accuracy": 0.648801326751709, + "num_tokens": 108878862.0, + "step": 263 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 0.9719793796539307, + "learning_rate": 0.0001, + "loss": 1.2326, + "mean_token_accuracy": 0.6538045406341553, + "num_tokens": 109287222.0, + "step": 264 + }, + { + "epoch": 0.4649122807017544, + "grad_norm": 0.6874752044677734, + "learning_rate": 0.0001, + "loss": 1.2319, + "mean_token_accuracy": 0.6524173021316528, + "num_tokens": 109693157.0, + "step": 265 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.8174811601638794, + "learning_rate": 0.0001, + "loss": 1.2235, + "mean_token_accuracy": 0.6545587182044983, + "num_tokens": 110099662.0, + "step": 266 + }, + { + "epoch": 0.46842105263157896, + "grad_norm": 0.7676987051963806, + "learning_rate": 0.0001, + "loss": 1.252, + "mean_token_accuracy": 0.6494687795639038, + "num_tokens": 110511759.0, + "step": 267 + }, + { + "epoch": 0.47017543859649125, + "grad_norm": 0.7034929394721985, + "learning_rate": 0.0001, + "loss": 1.2328, + "mean_token_accuracy": 0.653315544128418, + "num_tokens": 110927626.0, + "step": 268 + }, + { + "epoch": 0.47192982456140353, + "grad_norm": 0.6947440505027771, + "learning_rate": 0.0001, + "loss": 1.2451, + "mean_token_accuracy": 0.6503375172615051, + "num_tokens": 111371879.0, + "step": 269 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.7659525871276855, + "learning_rate": 0.0001, + "loss": 1.251, + "mean_token_accuracy": 0.6494304537773132, + "num_tokens": 111784069.0, + "step": 270 + }, + { + "epoch": 0.47543859649122805, + "grad_norm": 0.7740342617034912, + "learning_rate": 0.0001, + "loss": 1.2253, + "mean_token_accuracy": 0.6542062759399414, + "num_tokens": 112186870.0, + "step": 271 + }, + { + "epoch": 0.47719298245614034, + "grad_norm": 0.65045565366745, + "learning_rate": 0.0001, + "loss": 1.2305, + "mean_token_accuracy": 0.6532946228981018, + "num_tokens": 112616585.0, + "step": 272 + }, + { + "epoch": 0.4789473684210526, + "grad_norm": 0.7001651525497437, + "learning_rate": 0.0001, + "loss": 1.2247, + "mean_token_accuracy": 0.654727041721344, + "num_tokens": 113009798.0, + "step": 273 + }, + { + "epoch": 0.4807017543859649, + "grad_norm": 0.6165850162506104, + "learning_rate": 0.0001, + "loss": 1.2174, + "mean_token_accuracy": 0.6552860736846924, + "num_tokens": 113410244.0, + "step": 274 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 0.7424379587173462, + "learning_rate": 0.0001, + "loss": 1.2379, + "mean_token_accuracy": 0.653006911277771, + "num_tokens": 113805215.0, + "step": 275 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 0.7236623167991638, + "learning_rate": 0.0001, + "loss": 1.2748, + "mean_token_accuracy": 0.6438848972320557, + "num_tokens": 114224914.0, + "step": 276 + }, + { + "epoch": 0.48596491228070177, + "grad_norm": 0.6665499210357666, + "learning_rate": 0.0001, + "loss": 1.205, + "mean_token_accuracy": 0.6590371131896973, + "num_tokens": 114606381.0, + "step": 277 + }, + { + "epoch": 0.48771929824561405, + "grad_norm": 0.6881427764892578, + "learning_rate": 0.0001, + "loss": 1.2456, + "mean_token_accuracy": 0.6502711772918701, + "num_tokens": 115021603.0, + "step": 278 + }, + { + "epoch": 0.48947368421052634, + "grad_norm": 0.8498430848121643, + "learning_rate": 0.0001, + "loss": 1.2443, + "mean_token_accuracy": 0.6494314670562744, + "num_tokens": 115427417.0, + "step": 279 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 0.724355936050415, + "learning_rate": 0.0001, + "loss": 1.2574, + "mean_token_accuracy": 0.6479068398475647, + "num_tokens": 115868301.0, + "step": 280 + }, + { + "epoch": 0.49298245614035086, + "grad_norm": 0.6625252366065979, + "learning_rate": 0.0001, + "loss": 1.2128, + "mean_token_accuracy": 0.6565937995910645, + "num_tokens": 116278418.0, + "step": 281 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 0.8329636454582214, + "learning_rate": 0.0001, + "loss": 1.2419, + "mean_token_accuracy": 0.650545060634613, + "num_tokens": 116681770.0, + "step": 282 + }, + { + "epoch": 0.4964912280701754, + "grad_norm": 0.8298386335372925, + "learning_rate": 0.0001, + "loss": 1.2356, + "mean_token_accuracy": 0.6532111167907715, + "num_tokens": 117054940.0, + "step": 283 + }, + { + "epoch": 0.4982456140350877, + "grad_norm": 0.7011889219284058, + "learning_rate": 0.0001, + "loss": 1.2418, + "mean_token_accuracy": 0.6518392562866211, + "num_tokens": 117477299.0, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.710082471370697, + "learning_rate": 0.0001, + "loss": 1.2479, + "mean_token_accuracy": 0.6494768857955933, + "num_tokens": 117885331.0, + "step": 285 + }, + { + "epoch": 0.5017543859649123, + "grad_norm": 0.8371219038963318, + "learning_rate": 0.0001, + "loss": 1.2577, + "mean_token_accuracy": 0.6471771001815796, + "num_tokens": 118307149.0, + "step": 286 + }, + { + "epoch": 0.5035087719298246, + "grad_norm": 0.8411158919334412, + "learning_rate": 0.0001, + "loss": 1.219, + "mean_token_accuracy": 0.6554292440414429, + "num_tokens": 118697262.0, + "step": 287 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 0.7115722298622131, + "learning_rate": 0.0001, + "loss": 1.2501, + "mean_token_accuracy": 0.6482685804367065, + "num_tokens": 119124053.0, + "step": 288 + }, + { + "epoch": 0.5070175438596491, + "grad_norm": 0.6575236916542053, + "learning_rate": 0.0001, + "loss": 1.2555, + "mean_token_accuracy": 0.6482589840888977, + "num_tokens": 119569185.0, + "step": 289 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 0.8516756892204285, + "learning_rate": 0.0001, + "loss": 1.2332, + "mean_token_accuracy": 0.6534713506698608, + "num_tokens": 119987270.0, + "step": 290 + }, + { + "epoch": 0.5105263157894737, + "grad_norm": 0.7346055507659912, + "learning_rate": 0.0001, + "loss": 1.2347, + "mean_token_accuracy": 0.6515175700187683, + "num_tokens": 120377177.0, + "step": 291 + }, + { + "epoch": 0.512280701754386, + "grad_norm": 0.6637006402015686, + "learning_rate": 0.0001, + "loss": 1.2241, + "mean_token_accuracy": 0.6554847359657288, + "num_tokens": 120794750.0, + "step": 292 + }, + { + "epoch": 0.5140350877192983, + "grad_norm": 0.8050562143325806, + "learning_rate": 0.0001, + "loss": 1.2462, + "mean_token_accuracy": 0.6506670713424683, + "num_tokens": 121223883.0, + "step": 293 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 0.7059856057167053, + "learning_rate": 0.0001, + "loss": 1.2287, + "mean_token_accuracy": 0.6533610820770264, + "num_tokens": 121644108.0, + "step": 294 + }, + { + "epoch": 0.5175438596491229, + "grad_norm": 0.6939064860343933, + "learning_rate": 0.0001, + "loss": 1.2357, + "mean_token_accuracy": 0.6511529684066772, + "num_tokens": 122051612.0, + "step": 295 + }, + { + "epoch": 0.519298245614035, + "grad_norm": 0.8220492601394653, + "learning_rate": 0.0001, + "loss": 1.2257, + "mean_token_accuracy": 0.6545118093490601, + "num_tokens": 122430847.0, + "step": 296 + }, + { + "epoch": 0.5210526315789473, + "grad_norm": 0.7044985294342041, + "learning_rate": 0.0001, + "loss": 1.2309, + "mean_token_accuracy": 0.654273271560669, + "num_tokens": 122849234.0, + "step": 297 + }, + { + "epoch": 0.5228070175438596, + "grad_norm": 0.8146756291389465, + "learning_rate": 0.0001, + "loss": 1.2489, + "mean_token_accuracy": 0.6501311659812927, + "num_tokens": 123258290.0, + "step": 298 + }, + { + "epoch": 0.5245614035087719, + "grad_norm": 0.766899824142456, + "learning_rate": 0.0001, + "loss": 1.2436, + "mean_token_accuracy": 0.6506932973861694, + "num_tokens": 123663735.0, + "step": 299 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.7193543910980225, + "learning_rate": 0.0001, + "loss": 1.2451, + "mean_token_accuracy": 0.6491553783416748, + "num_tokens": 124058727.0, + "step": 300 + }, + { + "epoch": 0.5280701754385965, + "grad_norm": 0.6504607200622559, + "learning_rate": 0.0001, + "loss": 1.2455, + "mean_token_accuracy": 0.6509132385253906, + "num_tokens": 124484021.0, + "step": 301 + }, + { + "epoch": 0.5298245614035088, + "grad_norm": 0.7661638259887695, + "learning_rate": 0.0001, + "loss": 1.2158, + "mean_token_accuracy": 0.6565036177635193, + "num_tokens": 124902607.0, + "step": 302 + }, + { + "epoch": 0.531578947368421, + "grad_norm": 0.73735511302948, + "learning_rate": 0.0001, + "loss": 1.2408, + "mean_token_accuracy": 0.6503796577453613, + "num_tokens": 125338175.0, + "step": 303 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.9022007584571838, + "learning_rate": 0.0001, + "loss": 1.2332, + "mean_token_accuracy": 0.6524848937988281, + "num_tokens": 125757431.0, + "step": 304 + }, + { + "epoch": 0.5350877192982456, + "grad_norm": 0.6961904764175415, + "learning_rate": 0.0001, + "loss": 1.2217, + "mean_token_accuracy": 0.6560453176498413, + "num_tokens": 126155529.0, + "step": 305 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 0.6821785569190979, + "learning_rate": 0.0001, + "loss": 1.223, + "mean_token_accuracy": 0.6551496982574463, + "num_tokens": 126572012.0, + "step": 306 + }, + { + "epoch": 0.5385964912280702, + "grad_norm": 0.8659482002258301, + "learning_rate": 0.0001, + "loss": 1.2393, + "mean_token_accuracy": 0.6508756875991821, + "num_tokens": 126990573.0, + "step": 307 + }, + { + "epoch": 0.5403508771929825, + "grad_norm": 0.6646002531051636, + "learning_rate": 0.0001, + "loss": 1.2173, + "mean_token_accuracy": 0.6561950445175171, + "num_tokens": 127418976.0, + "step": 308 + }, + { + "epoch": 0.5421052631578948, + "grad_norm": 0.6923218369483948, + "learning_rate": 0.0001, + "loss": 1.1911, + "mean_token_accuracy": 0.6624599695205688, + "num_tokens": 127827614.0, + "step": 309 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 0.6864442825317383, + "learning_rate": 0.0001, + "loss": 1.2267, + "mean_token_accuracy": 0.6538540720939636, + "num_tokens": 128258313.0, + "step": 310 + }, + { + "epoch": 0.5456140350877193, + "grad_norm": 0.7230309247970581, + "learning_rate": 0.0001, + "loss": 1.2495, + "mean_token_accuracy": 0.6486865282058716, + "num_tokens": 128702682.0, + "step": 311 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 0.6914284825325012, + "learning_rate": 0.0001, + "loss": 1.2211, + "mean_token_accuracy": 0.6558956503868103, + "num_tokens": 129094449.0, + "step": 312 + }, + { + "epoch": 0.5491228070175439, + "grad_norm": 0.6948025822639465, + "learning_rate": 0.0001, + "loss": 1.2232, + "mean_token_accuracy": 0.6554611325263977, + "num_tokens": 129492795.0, + "step": 313 + }, + { + "epoch": 0.5508771929824562, + "grad_norm": 0.6883065104484558, + "learning_rate": 0.0001, + "loss": 1.225, + "mean_token_accuracy": 0.6549999713897705, + "num_tokens": 129923670.0, + "step": 314 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.7065843939781189, + "learning_rate": 0.0001, + "loss": 1.211, + "mean_token_accuracy": 0.6596200466156006, + "num_tokens": 130333012.0, + "step": 315 + }, + { + "epoch": 0.5543859649122806, + "grad_norm": 0.8073469996452332, + "learning_rate": 0.0001, + "loss": 1.2073, + "mean_token_accuracy": 0.6588984727859497, + "num_tokens": 130749305.0, + "step": 316 + }, + { + "epoch": 0.5561403508771929, + "grad_norm": 0.8134505748748779, + "learning_rate": 0.0001, + "loss": 1.2544, + "mean_token_accuracy": 0.6476566791534424, + "num_tokens": 131169421.0, + "step": 317 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 0.6765173077583313, + "learning_rate": 0.0001, + "loss": 1.2078, + "mean_token_accuracy": 0.6600826978683472, + "num_tokens": 131574788.0, + "step": 318 + }, + { + "epoch": 0.5596491228070175, + "grad_norm": 0.7156663537025452, + "learning_rate": 0.0001, + "loss": 1.2267, + "mean_token_accuracy": 0.6545342206954956, + "num_tokens": 131984459.0, + "step": 319 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 0.8103324174880981, + "learning_rate": 0.0001, + "loss": 1.2612, + "mean_token_accuracy": 0.6481289267539978, + "num_tokens": 132415165.0, + "step": 320 + }, + { + "epoch": 0.5631578947368421, + "grad_norm": 0.742142915725708, + "learning_rate": 0.0001, + "loss": 1.2039, + "mean_token_accuracy": 0.6596871018409729, + "num_tokens": 132838824.0, + "step": 321 + }, + { + "epoch": 0.5649122807017544, + "grad_norm": 0.7613045573234558, + "learning_rate": 0.0001, + "loss": 1.2095, + "mean_token_accuracy": 0.6572511792182922, + "num_tokens": 133247120.0, + "step": 322 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 0.7817480564117432, + "learning_rate": 0.0001, + "loss": 1.2232, + "mean_token_accuracy": 0.6545703411102295, + "num_tokens": 133672706.0, + "step": 323 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 0.6124296188354492, + "learning_rate": 0.0001, + "loss": 1.2342, + "mean_token_accuracy": 0.6552791595458984, + "num_tokens": 134079103.0, + "step": 324 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 0.6886869668960571, + "learning_rate": 0.0001, + "loss": 1.2231, + "mean_token_accuracy": 0.6547764539718628, + "num_tokens": 134494826.0, + "step": 325 + }, + { + "epoch": 0.5719298245614035, + "grad_norm": 0.6630454659461975, + "learning_rate": 0.0001, + "loss": 1.2345, + "mean_token_accuracy": 0.6520872116088867, + "num_tokens": 134919504.0, + "step": 326 + }, + { + "epoch": 0.5736842105263158, + "grad_norm": 0.8173869252204895, + "learning_rate": 0.0001, + "loss": 1.2276, + "mean_token_accuracy": 0.6533281207084656, + "num_tokens": 135331962.0, + "step": 327 + }, + { + "epoch": 0.5754385964912281, + "grad_norm": 0.6743276715278625, + "learning_rate": 0.0001, + "loss": 1.2281, + "mean_token_accuracy": 0.6535520553588867, + "num_tokens": 135745710.0, + "step": 328 + }, + { + "epoch": 0.5771929824561404, + "grad_norm": 0.6731691360473633, + "learning_rate": 0.0001, + "loss": 1.2346, + "mean_token_accuracy": 0.6528003215789795, + "num_tokens": 136174418.0, + "step": 329 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.6211588382720947, + "learning_rate": 0.0001, + "loss": 1.2329, + "mean_token_accuracy": 0.653133749961853, + "num_tokens": 136597949.0, + "step": 330 + }, + { + "epoch": 0.5807017543859649, + "grad_norm": 0.8585658073425293, + "learning_rate": 0.0001, + "loss": 1.2506, + "mean_token_accuracy": 0.648890495300293, + "num_tokens": 137047696.0, + "step": 331 + }, + { + "epoch": 0.5824561403508772, + "grad_norm": 0.8006256222724915, + "learning_rate": 0.0001, + "loss": 1.2119, + "mean_token_accuracy": 0.658997118473053, + "num_tokens": 137428701.0, + "step": 332 + }, + { + "epoch": 0.5842105263157895, + "grad_norm": 0.692973792552948, + "learning_rate": 0.0001, + "loss": 1.2167, + "mean_token_accuracy": 0.6570533514022827, + "num_tokens": 137823083.0, + "step": 333 + }, + { + "epoch": 0.5859649122807018, + "grad_norm": 0.7685320973396301, + "learning_rate": 0.0001, + "loss": 1.271, + "mean_token_accuracy": 0.6423413157463074, + "num_tokens": 138250591.0, + "step": 334 + }, + { + "epoch": 0.5877192982456141, + "grad_norm": 0.7700155377388, + "learning_rate": 0.0001, + "loss": 1.2252, + "mean_token_accuracy": 0.6543079614639282, + "num_tokens": 138660562.0, + "step": 335 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 0.7410191893577576, + "learning_rate": 0.0001, + "loss": 1.2156, + "mean_token_accuracy": 0.6561688184738159, + "num_tokens": 139077597.0, + "step": 336 + }, + { + "epoch": 0.5912280701754385, + "grad_norm": 0.7632637619972229, + "learning_rate": 0.0001, + "loss": 1.235, + "mean_token_accuracy": 0.6516165733337402, + "num_tokens": 139482698.0, + "step": 337 + }, + { + "epoch": 0.5929824561403508, + "grad_norm": 0.690731942653656, + "learning_rate": 0.0001, + "loss": 1.2408, + "mean_token_accuracy": 0.6502476930618286, + "num_tokens": 139906177.0, + "step": 338 + }, + { + "epoch": 0.5947368421052631, + "grad_norm": 0.6513046026229858, + "learning_rate": 0.0001, + "loss": 1.2297, + "mean_token_accuracy": 0.6529406905174255, + "num_tokens": 140319741.0, + "step": 339 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 0.6879235506057739, + "learning_rate": 0.0001, + "loss": 1.2234, + "mean_token_accuracy": 0.6540219783782959, + "num_tokens": 140731809.0, + "step": 340 + }, + { + "epoch": 0.5982456140350877, + "grad_norm": 0.7240639925003052, + "learning_rate": 0.0001, + "loss": 1.2043, + "mean_token_accuracy": 0.6592921018600464, + "num_tokens": 141126659.0, + "step": 341 + }, + { + "epoch": 0.6, + "grad_norm": 0.6559076905250549, + "learning_rate": 0.0001, + "loss": 1.2353, + "mean_token_accuracy": 0.6514889001846313, + "num_tokens": 141551619.0, + "step": 342 + }, + { + "epoch": 0.6017543859649123, + "grad_norm": 0.7054679989814758, + "learning_rate": 0.0001, + "loss": 1.2225, + "mean_token_accuracy": 0.6558979749679565, + "num_tokens": 141970744.0, + "step": 343 + }, + { + "epoch": 0.6035087719298246, + "grad_norm": 0.6867666244506836, + "learning_rate": 0.0001, + "loss": 1.2162, + "mean_token_accuracy": 0.6566264629364014, + "num_tokens": 142395401.0, + "step": 344 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.6507348418235779, + "learning_rate": 0.0001, + "loss": 1.2244, + "mean_token_accuracy": 0.6545271277427673, + "num_tokens": 142827910.0, + "step": 345 + }, + { + "epoch": 0.6070175438596491, + "grad_norm": 0.7520820498466492, + "learning_rate": 0.0001, + "loss": 1.2315, + "mean_token_accuracy": 0.6522303819656372, + "num_tokens": 143256693.0, + "step": 346 + }, + { + "epoch": 0.6087719298245614, + "grad_norm": 0.7250421047210693, + "learning_rate": 0.0001, + "loss": 1.2126, + "mean_token_accuracy": 0.6569870710372925, + "num_tokens": 143670113.0, + "step": 347 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 0.707240104675293, + "learning_rate": 0.0001, + "loss": 1.2337, + "mean_token_accuracy": 0.6521450281143188, + "num_tokens": 144085012.0, + "step": 348 + }, + { + "epoch": 0.612280701754386, + "grad_norm": 0.6530799269676208, + "learning_rate": 0.0001, + "loss": 1.2366, + "mean_token_accuracy": 0.650078296661377, + "num_tokens": 144511304.0, + "step": 349 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 0.7164869904518127, + "learning_rate": 0.0001, + "loss": 1.1957, + "mean_token_accuracy": 0.6614063382148743, + "num_tokens": 144903389.0, + "step": 350 + }, + { + "epoch": 0.6157894736842106, + "grad_norm": 0.6941936612129211, + "learning_rate": 0.0001, + "loss": 1.2244, + "mean_token_accuracy": 0.6538809537887573, + "num_tokens": 145317796.0, + "step": 351 + }, + { + "epoch": 0.6175438596491228, + "grad_norm": 0.5569853186607361, + "learning_rate": 0.0001, + "loss": 1.2303, + "mean_token_accuracy": 0.6540141701698303, + "num_tokens": 145728692.0, + "step": 352 + }, + { + "epoch": 0.6192982456140351, + "grad_norm": 0.6453179121017456, + "learning_rate": 0.0001, + "loss": 1.2219, + "mean_token_accuracy": 0.6558411121368408, + "num_tokens": 146149428.0, + "step": 353 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 0.7571195363998413, + "learning_rate": 0.0001, + "loss": 1.2371, + "mean_token_accuracy": 0.6512084007263184, + "num_tokens": 146567803.0, + "step": 354 + }, + { + "epoch": 0.6228070175438597, + "grad_norm": 0.7026142477989197, + "learning_rate": 0.0001, + "loss": 1.2577, + "mean_token_accuracy": 0.6465089917182922, + "num_tokens": 146996287.0, + "step": 355 + }, + { + "epoch": 0.624561403508772, + "grad_norm": 0.7396862506866455, + "learning_rate": 0.0001, + "loss": 1.207, + "mean_token_accuracy": 0.657769501209259, + "num_tokens": 147406032.0, + "step": 356 + }, + { + "epoch": 0.6263157894736842, + "grad_norm": 0.7301826477050781, + "learning_rate": 0.0001, + "loss": 1.2549, + "mean_token_accuracy": 0.648101270198822, + "num_tokens": 147818711.0, + "step": 357 + }, + { + "epoch": 0.6280701754385964, + "grad_norm": 0.6443963646888733, + "learning_rate": 0.0001, + "loss": 1.2247, + "mean_token_accuracy": 0.6540451645851135, + "num_tokens": 148224782.0, + "step": 358 + }, + { + "epoch": 0.6298245614035087, + "grad_norm": 0.5962257981300354, + "learning_rate": 0.0001, + "loss": 1.2095, + "mean_token_accuracy": 0.6578296422958374, + "num_tokens": 148638041.0, + "step": 359 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.553277850151062, + "learning_rate": 0.0001, + "loss": 1.1954, + "mean_token_accuracy": 0.6618661880493164, + "num_tokens": 149034209.0, + "step": 360 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.8141903281211853, + "learning_rate": 0.0001, + "loss": 1.2181, + "mean_token_accuracy": 0.6552863121032715, + "num_tokens": 149462197.0, + "step": 361 + }, + { + "epoch": 0.6350877192982456, + "grad_norm": 0.6312337517738342, + "learning_rate": 0.0001, + "loss": 1.2237, + "mean_token_accuracy": 0.6552015542984009, + "num_tokens": 149874309.0, + "step": 362 + }, + { + "epoch": 0.6368421052631579, + "grad_norm": 0.6863110661506653, + "learning_rate": 0.0001, + "loss": 1.2144, + "mean_token_accuracy": 0.6574706435203552, + "num_tokens": 150278354.0, + "step": 363 + }, + { + "epoch": 0.6385964912280702, + "grad_norm": 0.7062144875526428, + "learning_rate": 0.0001, + "loss": 1.2309, + "mean_token_accuracy": 0.6519078016281128, + "num_tokens": 150696393.0, + "step": 364 + }, + { + "epoch": 0.6403508771929824, + "grad_norm": 0.6141137480735779, + "learning_rate": 0.0001, + "loss": 1.2175, + "mean_token_accuracy": 0.6562414169311523, + "num_tokens": 151097966.0, + "step": 365 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 0.6939074993133545, + "learning_rate": 0.0001, + "loss": 1.2115, + "mean_token_accuracy": 0.6587799787521362, + "num_tokens": 151487261.0, + "step": 366 + }, + { + "epoch": 0.643859649122807, + "grad_norm": 0.6834867596626282, + "learning_rate": 0.0001, + "loss": 1.2492, + "mean_token_accuracy": 0.6492800116539001, + "num_tokens": 151915716.0, + "step": 367 + }, + { + "epoch": 0.6456140350877193, + "grad_norm": 0.6845062971115112, + "learning_rate": 0.0001, + "loss": 1.2045, + "mean_token_accuracy": 0.6578772664070129, + "num_tokens": 152333716.0, + "step": 368 + }, + { + "epoch": 0.6473684210526316, + "grad_norm": 0.6263954639434814, + "learning_rate": 0.0001, + "loss": 1.2285, + "mean_token_accuracy": 0.6524069905281067, + "num_tokens": 152758004.0, + "step": 369 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 0.7604780793190002, + "learning_rate": 0.0001, + "loss": 1.2321, + "mean_token_accuracy": 0.6509186029434204, + "num_tokens": 153175726.0, + "step": 370 + }, + { + "epoch": 0.6508771929824562, + "grad_norm": 0.6607220768928528, + "learning_rate": 0.0001, + "loss": 1.193, + "mean_token_accuracy": 0.6612952947616577, + "num_tokens": 153573526.0, + "step": 371 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 0.7317623496055603, + "learning_rate": 0.0001, + "loss": 1.2233, + "mean_token_accuracy": 0.6555420756340027, + "num_tokens": 154001303.0, + "step": 372 + }, + { + "epoch": 0.6543859649122807, + "grad_norm": 0.5643908381462097, + "learning_rate": 0.0001, + "loss": 1.1888, + "mean_token_accuracy": 0.6617960929870605, + "num_tokens": 154409874.0, + "step": 373 + }, + { + "epoch": 0.656140350877193, + "grad_norm": 0.631582498550415, + "learning_rate": 0.0001, + "loss": 1.2192, + "mean_token_accuracy": 0.6558030843734741, + "num_tokens": 154826554.0, + "step": 374 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.745689332485199, + "learning_rate": 0.0001, + "loss": 1.2146, + "mean_token_accuracy": 0.6560918688774109, + "num_tokens": 155230585.0, + "step": 375 + }, + { + "epoch": 0.6596491228070176, + "grad_norm": 0.651474118232727, + "learning_rate": 0.0001, + "loss": 1.2055, + "mean_token_accuracy": 0.6597638726234436, + "num_tokens": 155636974.0, + "step": 376 + }, + { + "epoch": 0.6614035087719298, + "grad_norm": 0.7227398753166199, + "learning_rate": 0.0001, + "loss": 1.2211, + "mean_token_accuracy": 0.6551980972290039, + "num_tokens": 156063068.0, + "step": 377 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 0.6124153137207031, + "learning_rate": 0.0001, + "loss": 1.2435, + "mean_token_accuracy": 0.6511872410774231, + "num_tokens": 156510643.0, + "step": 378 + }, + { + "epoch": 0.6649122807017543, + "grad_norm": 0.7193928360939026, + "learning_rate": 0.0001, + "loss": 1.2242, + "mean_token_accuracy": 0.6536823511123657, + "num_tokens": 156947423.0, + "step": 379 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.7923741936683655, + "learning_rate": 0.0001, + "loss": 1.2149, + "mean_token_accuracy": 0.6561374664306641, + "num_tokens": 157370426.0, + "step": 380 + }, + { + "epoch": 0.6684210526315789, + "grad_norm": 0.7290387153625488, + "learning_rate": 0.0001, + "loss": 1.2132, + "mean_token_accuracy": 0.6568667888641357, + "num_tokens": 157782963.0, + "step": 381 + }, + { + "epoch": 0.6701754385964912, + "grad_norm": 0.6192464232444763, + "learning_rate": 0.0001, + "loss": 1.2309, + "mean_token_accuracy": 0.6526235342025757, + "num_tokens": 158180387.0, + "step": 382 + }, + { + "epoch": 0.6719298245614035, + "grad_norm": 0.7137374877929688, + "learning_rate": 0.0001, + "loss": 1.2328, + "mean_token_accuracy": 0.6518644094467163, + "num_tokens": 158582401.0, + "step": 383 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 0.7550848126411438, + "learning_rate": 0.0001, + "loss": 1.2166, + "mean_token_accuracy": 0.6573696136474609, + "num_tokens": 158991919.0, + "step": 384 + }, + { + "epoch": 0.6754385964912281, + "grad_norm": 0.6890254020690918, + "learning_rate": 0.0001, + "loss": 1.2107, + "mean_token_accuracy": 0.6569054126739502, + "num_tokens": 159393180.0, + "step": 385 + }, + { + "epoch": 0.6771929824561403, + "grad_norm": 0.7258317470550537, + "learning_rate": 0.0001, + "loss": 1.2151, + "mean_token_accuracy": 0.6549711227416992, + "num_tokens": 159800687.0, + "step": 386 + }, + { + "epoch": 0.6789473684210526, + "grad_norm": 0.7973881363868713, + "learning_rate": 0.0001, + "loss": 1.1913, + "mean_token_accuracy": 0.6616858839988708, + "num_tokens": 160206677.0, + "step": 387 + }, + { + "epoch": 0.6807017543859649, + "grad_norm": 0.6781461238861084, + "learning_rate": 0.0001, + "loss": 1.2296, + "mean_token_accuracy": 0.6531736850738525, + "num_tokens": 160623973.0, + "step": 388 + }, + { + "epoch": 0.6824561403508772, + "grad_norm": 0.8034713268280029, + "learning_rate": 0.0001, + "loss": 1.2306, + "mean_token_accuracy": 0.6528879404067993, + "num_tokens": 161033575.0, + "step": 389 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.7085846066474915, + "learning_rate": 0.0001, + "loss": 1.1892, + "mean_token_accuracy": 0.6614021062850952, + "num_tokens": 161420334.0, + "step": 390 + }, + { + "epoch": 0.6859649122807018, + "grad_norm": 0.712842583656311, + "learning_rate": 0.0001, + "loss": 1.2096, + "mean_token_accuracy": 0.6570684909820557, + "num_tokens": 161823214.0, + "step": 391 + }, + { + "epoch": 0.6877192982456141, + "grad_norm": 0.6031337380409241, + "learning_rate": 0.0001, + "loss": 1.2131, + "mean_token_accuracy": 0.6579930782318115, + "num_tokens": 162253222.0, + "step": 392 + }, + { + "epoch": 0.6894736842105263, + "grad_norm": 0.6571363806724548, + "learning_rate": 0.0001, + "loss": 1.2151, + "mean_token_accuracy": 0.6550261974334717, + "num_tokens": 162673396.0, + "step": 393 + }, + { + "epoch": 0.6912280701754386, + "grad_norm": 0.590053915977478, + "learning_rate": 0.0001, + "loss": 1.1913, + "mean_token_accuracy": 0.6606940031051636, + "num_tokens": 163095701.0, + "step": 394 + }, + { + "epoch": 0.6929824561403509, + "grad_norm": 0.660569429397583, + "learning_rate": 0.0001, + "loss": 1.2168, + "mean_token_accuracy": 0.6552713513374329, + "num_tokens": 163503487.0, + "step": 395 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 0.5482744574546814, + "learning_rate": 0.0001, + "loss": 1.1966, + "mean_token_accuracy": 0.6622109413146973, + "num_tokens": 163908638.0, + "step": 396 + }, + { + "epoch": 0.6964912280701754, + "grad_norm": 0.6649277210235596, + "learning_rate": 0.0001, + "loss": 1.2082, + "mean_token_accuracy": 0.6560900211334229, + "num_tokens": 164321664.0, + "step": 397 + }, + { + "epoch": 0.6982456140350877, + "grad_norm": 0.6546705365180969, + "learning_rate": 0.0001, + "loss": 1.208, + "mean_token_accuracy": 0.6577179431915283, + "num_tokens": 164739198.0, + "step": 398 + }, + { + "epoch": 0.7, + "grad_norm": 0.6374883651733398, + "learning_rate": 0.0001, + "loss": 1.1893, + "mean_token_accuracy": 0.660727322101593, + "num_tokens": 165130707.0, + "step": 399 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.6626867055892944, + "learning_rate": 0.0001, + "loss": 1.2056, + "mean_token_accuracy": 0.6570228934288025, + "num_tokens": 165544679.0, + "step": 400 + }, + { + "epoch": 0.7035087719298245, + "grad_norm": 0.648720920085907, + "learning_rate": 0.0001, + "loss": 1.1889, + "mean_token_accuracy": 0.6612677574157715, + "num_tokens": 165963969.0, + "step": 401 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 0.6660583019256592, + "learning_rate": 0.0001, + "loss": 1.201, + "mean_token_accuracy": 0.6595137119293213, + "num_tokens": 166342479.0, + "step": 402 + }, + { + "epoch": 0.7070175438596491, + "grad_norm": 0.6676925420761108, + "learning_rate": 0.0001, + "loss": 1.2178, + "mean_token_accuracy": 0.655318558216095, + "num_tokens": 166746524.0, + "step": 403 + }, + { + "epoch": 0.7087719298245614, + "grad_norm": 0.6398362517356873, + "learning_rate": 0.0001, + "loss": 1.2217, + "mean_token_accuracy": 0.6538881063461304, + "num_tokens": 167166144.0, + "step": 404 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.6486631035804749, + "learning_rate": 0.0001, + "loss": 1.2107, + "mean_token_accuracy": 0.6568524837493896, + "num_tokens": 167576053.0, + "step": 405 + }, + { + "epoch": 0.712280701754386, + "grad_norm": 0.6971449851989746, + "learning_rate": 0.0001, + "loss": 1.2072, + "mean_token_accuracy": 0.6588019132614136, + "num_tokens": 168005716.0, + "step": 406 + }, + { + "epoch": 0.7140350877192982, + "grad_norm": 0.5594667792320251, + "learning_rate": 0.0001, + "loss": 1.1815, + "mean_token_accuracy": 0.6640152931213379, + "num_tokens": 168425061.0, + "step": 407 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 0.6978932619094849, + "learning_rate": 0.0001, + "loss": 1.2123, + "mean_token_accuracy": 0.6560491323471069, + "num_tokens": 168832540.0, + "step": 408 + }, + { + "epoch": 0.7175438596491228, + "grad_norm": 0.577872097492218, + "learning_rate": 0.0001, + "loss": 1.1961, + "mean_token_accuracy": 0.6605911254882812, + "num_tokens": 169243355.0, + "step": 409 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 0.6972746849060059, + "learning_rate": 0.0001, + "loss": 1.2263, + "mean_token_accuracy": 0.6549021005630493, + "num_tokens": 169678758.0, + "step": 410 + }, + { + "epoch": 0.7210526315789474, + "grad_norm": 0.6528338193893433, + "learning_rate": 0.0001, + "loss": 1.2193, + "mean_token_accuracy": 0.6543501615524292, + "num_tokens": 170107843.0, + "step": 411 + }, + { + "epoch": 0.7228070175438597, + "grad_norm": 0.6352643370628357, + "learning_rate": 0.0001, + "loss": 1.21, + "mean_token_accuracy": 0.6568456292152405, + "num_tokens": 170512414.0, + "step": 412 + }, + { + "epoch": 0.724561403508772, + "grad_norm": 0.6633725762367249, + "learning_rate": 0.0001, + "loss": 1.2273, + "mean_token_accuracy": 0.6531171798706055, + "num_tokens": 170927891.0, + "step": 413 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 0.7003793716430664, + "learning_rate": 0.0001, + "loss": 1.2471, + "mean_token_accuracy": 0.6485984921455383, + "num_tokens": 171347602.0, + "step": 414 + }, + { + "epoch": 0.7280701754385965, + "grad_norm": 0.6166436076164246, + "learning_rate": 0.0001, + "loss": 1.1822, + "mean_token_accuracy": 0.664222240447998, + "num_tokens": 171764325.0, + "step": 415 + }, + { + "epoch": 0.7298245614035088, + "grad_norm": 0.6370410323143005, + "learning_rate": 0.0001, + "loss": 1.2288, + "mean_token_accuracy": 0.6530359387397766, + "num_tokens": 172161316.0, + "step": 416 + }, + { + "epoch": 0.7315789473684211, + "grad_norm": 0.5680028200149536, + "learning_rate": 0.0001, + "loss": 1.188, + "mean_token_accuracy": 0.663171112537384, + "num_tokens": 172557979.0, + "step": 417 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.6317917704582214, + "learning_rate": 0.0001, + "loss": 1.2088, + "mean_token_accuracy": 0.6587448120117188, + "num_tokens": 172977293.0, + "step": 418 + }, + { + "epoch": 0.7350877192982456, + "grad_norm": 0.6629990935325623, + "learning_rate": 0.0001, + "loss": 1.206, + "mean_token_accuracy": 0.657719612121582, + "num_tokens": 173386905.0, + "step": 419 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.7318717241287231, + "learning_rate": 0.0001, + "loss": 1.1874, + "mean_token_accuracy": 0.662236750125885, + "num_tokens": 173790039.0, + "step": 420 + }, + { + "epoch": 0.7385964912280701, + "grad_norm": 0.5909295678138733, + "learning_rate": 0.0001, + "loss": 1.1857, + "mean_token_accuracy": 0.66287761926651, + "num_tokens": 174200214.0, + "step": 421 + }, + { + "epoch": 0.7403508771929824, + "grad_norm": 0.7244629859924316, + "learning_rate": 0.0001, + "loss": 1.198, + "mean_token_accuracy": 0.6586729288101196, + "num_tokens": 174594155.0, + "step": 422 + }, + { + "epoch": 0.7421052631578947, + "grad_norm": 0.7065144777297974, + "learning_rate": 0.0001, + "loss": 1.189, + "mean_token_accuracy": 0.6611475348472595, + "num_tokens": 175025672.0, + "step": 423 + }, + { + "epoch": 0.743859649122807, + "grad_norm": 0.6348630785942078, + "learning_rate": 0.0001, + "loss": 1.2285, + "mean_token_accuracy": 0.6518391370773315, + "num_tokens": 175452636.0, + "step": 424 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 0.6401616930961609, + "learning_rate": 0.0001, + "loss": 1.2204, + "mean_token_accuracy": 0.655087947845459, + "num_tokens": 175879906.0, + "step": 425 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 0.6971575617790222, + "learning_rate": 0.0001, + "loss": 1.2111, + "mean_token_accuracy": 0.6562093496322632, + "num_tokens": 176292162.0, + "step": 426 + }, + { + "epoch": 0.7491228070175439, + "grad_norm": 0.6440587043762207, + "learning_rate": 0.0001, + "loss": 1.2012, + "mean_token_accuracy": 0.6593471765518188, + "num_tokens": 176720725.0, + "step": 427 + }, + { + "epoch": 0.7508771929824561, + "grad_norm": 0.597520649433136, + "learning_rate": 0.0001, + "loss": 1.2284, + "mean_token_accuracy": 0.6531672477722168, + "num_tokens": 177161243.0, + "step": 428 + }, + { + "epoch": 0.7526315789473684, + "grad_norm": 0.8046004772186279, + "learning_rate": 0.0001, + "loss": 1.1928, + "mean_token_accuracy": 0.6594202518463135, + "num_tokens": 177562007.0, + "step": 429 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 0.6298813223838806, + "learning_rate": 0.0001, + "loss": 1.219, + "mean_token_accuracy": 0.6546623706817627, + "num_tokens": 177983323.0, + "step": 430 + }, + { + "epoch": 0.756140350877193, + "grad_norm": 0.5731974840164185, + "learning_rate": 0.0001, + "loss": 1.2153, + "mean_token_accuracy": 0.6574859619140625, + "num_tokens": 178409419.0, + "step": 431 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 0.7396548390388489, + "learning_rate": 0.0001, + "loss": 1.2113, + "mean_token_accuracy": 0.6574329137802124, + "num_tokens": 178832025.0, + "step": 432 + }, + { + "epoch": 0.7596491228070176, + "grad_norm": 0.6398889422416687, + "learning_rate": 0.0001, + "loss": 1.2159, + "mean_token_accuracy": 0.6554309129714966, + "num_tokens": 179246477.0, + "step": 433 + }, + { + "epoch": 0.7614035087719299, + "grad_norm": 0.6085229516029358, + "learning_rate": 0.0001, + "loss": 1.2216, + "mean_token_accuracy": 0.6549739837646484, + "num_tokens": 179666041.0, + "step": 434 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.7816640734672546, + "learning_rate": 0.0001, + "loss": 1.2119, + "mean_token_accuracy": 0.6565245389938354, + "num_tokens": 180092225.0, + "step": 435 + }, + { + "epoch": 0.7649122807017544, + "grad_norm": 0.8083506226539612, + "learning_rate": 0.0001, + "loss": 1.1961, + "mean_token_accuracy": 0.6606351733207703, + "num_tokens": 180498123.0, + "step": 436 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 0.6019986271858215, + "learning_rate": 0.0001, + "loss": 1.1972, + "mean_token_accuracy": 0.6611742377281189, + "num_tokens": 180904735.0, + "step": 437 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 0.6621778011322021, + "learning_rate": 0.0001, + "loss": 1.1987, + "mean_token_accuracy": 0.6592767238616943, + "num_tokens": 181297827.0, + "step": 438 + }, + { + "epoch": 0.7701754385964912, + "grad_norm": 0.5817862749099731, + "learning_rate": 0.0001, + "loss": 1.2096, + "mean_token_accuracy": 0.6584010124206543, + "num_tokens": 181722096.0, + "step": 439 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 0.6433981657028198, + "learning_rate": 0.0001, + "loss": 1.1872, + "mean_token_accuracy": 0.6626437902450562, + "num_tokens": 182124515.0, + "step": 440 + }, + { + "epoch": 0.7736842105263158, + "grad_norm": 0.6573434472084045, + "learning_rate": 0.0001, + "loss": 1.2159, + "mean_token_accuracy": 0.6551169157028198, + "num_tokens": 182542265.0, + "step": 441 + }, + { + "epoch": 0.775438596491228, + "grad_norm": 0.684744656085968, + "learning_rate": 0.0001, + "loss": 1.2095, + "mean_token_accuracy": 0.6574633121490479, + "num_tokens": 182942368.0, + "step": 442 + }, + { + "epoch": 0.7771929824561403, + "grad_norm": 0.5961515307426453, + "learning_rate": 0.0001, + "loss": 1.2012, + "mean_token_accuracy": 0.6580845713615417, + "num_tokens": 183350652.0, + "step": 443 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 0.7268422842025757, + "learning_rate": 0.0001, + "loss": 1.2082, + "mean_token_accuracy": 0.657654345035553, + "num_tokens": 183786290.0, + "step": 444 + }, + { + "epoch": 0.7807017543859649, + "grad_norm": 0.7548661231994629, + "learning_rate": 0.0001, + "loss": 1.203, + "mean_token_accuracy": 0.6581261157989502, + "num_tokens": 184191985.0, + "step": 445 + }, + { + "epoch": 0.7824561403508772, + "grad_norm": 0.589838981628418, + "learning_rate": 0.0001, + "loss": 1.2253, + "mean_token_accuracy": 0.652956485748291, + "num_tokens": 184617087.0, + "step": 446 + }, + { + "epoch": 0.7842105263157895, + "grad_norm": 0.7901304960250854, + "learning_rate": 0.0001, + "loss": 1.2023, + "mean_token_accuracy": 0.6594702005386353, + "num_tokens": 185046113.0, + "step": 447 + }, + { + "epoch": 0.7859649122807018, + "grad_norm": 0.681577205657959, + "learning_rate": 0.0001, + "loss": 1.1765, + "mean_token_accuracy": 0.6648210287094116, + "num_tokens": 185440210.0, + "step": 448 + }, + { + "epoch": 0.787719298245614, + "grad_norm": 0.619105339050293, + "learning_rate": 0.0001, + "loss": 1.2151, + "mean_token_accuracy": 0.6544394493103027, + "num_tokens": 185866240.0, + "step": 449 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.6568613648414612, + "learning_rate": 0.0001, + "loss": 1.1808, + "mean_token_accuracy": 0.6645166277885437, + "num_tokens": 186262559.0, + "step": 450 + }, + { + "epoch": 0.7912280701754386, + "grad_norm": 0.6452411413192749, + "learning_rate": 0.0001, + "loss": 1.2289, + "mean_token_accuracy": 0.6531310677528381, + "num_tokens": 186677017.0, + "step": 451 + }, + { + "epoch": 0.7929824561403509, + "grad_norm": 0.6799737215042114, + "learning_rate": 0.0001, + "loss": 1.2207, + "mean_token_accuracy": 0.6556583046913147, + "num_tokens": 187108135.0, + "step": 452 + }, + { + "epoch": 0.7947368421052632, + "grad_norm": 0.5680040717124939, + "learning_rate": 0.0001, + "loss": 1.1886, + "mean_token_accuracy": 0.6613532900810242, + "num_tokens": 187533913.0, + "step": 453 + }, + { + "epoch": 0.7964912280701755, + "grad_norm": 0.6380943655967712, + "learning_rate": 0.0001, + "loss": 1.2136, + "mean_token_accuracy": 0.6577192544937134, + "num_tokens": 187943777.0, + "step": 454 + }, + { + "epoch": 0.7982456140350878, + "grad_norm": 0.5565281510353088, + "learning_rate": 0.0001, + "loss": 1.1941, + "mean_token_accuracy": 0.6604122519493103, + "num_tokens": 188365013.0, + "step": 455 + }, + { + "epoch": 0.8, + "grad_norm": 0.6176914572715759, + "learning_rate": 0.0001, + "loss": 1.1957, + "mean_token_accuracy": 0.6601607799530029, + "num_tokens": 188805546.0, + "step": 456 + }, + { + "epoch": 0.8017543859649123, + "grad_norm": 0.6163376569747925, + "learning_rate": 0.0001, + "loss": 1.1788, + "mean_token_accuracy": 0.6634429097175598, + "num_tokens": 189204261.0, + "step": 457 + }, + { + "epoch": 0.8035087719298246, + "grad_norm": 0.6874009966850281, + "learning_rate": 0.0001, + "loss": 1.2061, + "mean_token_accuracy": 0.6587145924568176, + "num_tokens": 189609457.0, + "step": 458 + }, + { + "epoch": 0.8052631578947368, + "grad_norm": 0.6584733724594116, + "learning_rate": 0.0001, + "loss": 1.2077, + "mean_token_accuracy": 0.6568840742111206, + "num_tokens": 190024904.0, + "step": 459 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 0.554511547088623, + "learning_rate": 0.0001, + "loss": 1.1883, + "mean_token_accuracy": 0.661857008934021, + "num_tokens": 190435843.0, + "step": 460 + }, + { + "epoch": 0.8087719298245614, + "grad_norm": 0.6625659465789795, + "learning_rate": 0.0001, + "loss": 1.209, + "mean_token_accuracy": 0.6570659875869751, + "num_tokens": 190844879.0, + "step": 461 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 0.6230789422988892, + "learning_rate": 0.0001, + "loss": 1.1932, + "mean_token_accuracy": 0.6605242490768433, + "num_tokens": 191240668.0, + "step": 462 + }, + { + "epoch": 0.8122807017543859, + "grad_norm": 0.5848865509033203, + "learning_rate": 0.0001, + "loss": 1.2055, + "mean_token_accuracy": 0.6577123999595642, + "num_tokens": 191649912.0, + "step": 463 + }, + { + "epoch": 0.8140350877192982, + "grad_norm": 0.7131868600845337, + "learning_rate": 0.0001, + "loss": 1.1945, + "mean_token_accuracy": 0.6616647839546204, + "num_tokens": 192065093.0, + "step": 464 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.620922863483429, + "learning_rate": 0.0001, + "loss": 1.2134, + "mean_token_accuracy": 0.657670259475708, + "num_tokens": 192458897.0, + "step": 465 + }, + { + "epoch": 0.8175438596491228, + "grad_norm": 0.6825653910636902, + "learning_rate": 0.0001, + "loss": 1.2083, + "mean_token_accuracy": 0.6574592590332031, + "num_tokens": 192871837.0, + "step": 466 + }, + { + "epoch": 0.8192982456140351, + "grad_norm": 0.649117648601532, + "learning_rate": 0.0001, + "loss": 1.1904, + "mean_token_accuracy": 0.661907970905304, + "num_tokens": 193297231.0, + "step": 467 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 0.5843600630760193, + "learning_rate": 0.0001, + "loss": 1.1613, + "mean_token_accuracy": 0.6669655442237854, + "num_tokens": 193684403.0, + "step": 468 + }, + { + "epoch": 0.8228070175438597, + "grad_norm": 0.6877574324607849, + "learning_rate": 0.0001, + "loss": 1.2131, + "mean_token_accuracy": 0.6550207138061523, + "num_tokens": 194113357.0, + "step": 469 + }, + { + "epoch": 0.8245614035087719, + "grad_norm": 0.6516855955123901, + "learning_rate": 0.0001, + "loss": 1.1979, + "mean_token_accuracy": 0.65838223695755, + "num_tokens": 194526469.0, + "step": 470 + }, + { + "epoch": 0.8263157894736842, + "grad_norm": 0.6000040769577026, + "learning_rate": 0.0001, + "loss": 1.2303, + "mean_token_accuracy": 0.6509213447570801, + "num_tokens": 194953909.0, + "step": 471 + }, + { + "epoch": 0.8280701754385965, + "grad_norm": 0.6414221525192261, + "learning_rate": 0.0001, + "loss": 1.2276, + "mean_token_accuracy": 0.6521142721176147, + "num_tokens": 195388769.0, + "step": 472 + }, + { + "epoch": 0.8298245614035088, + "grad_norm": 0.614547848701477, + "learning_rate": 0.0001, + "loss": 1.2174, + "mean_token_accuracy": 0.6551234126091003, + "num_tokens": 195818916.0, + "step": 473 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 0.6391692161560059, + "learning_rate": 0.0001, + "loss": 1.1963, + "mean_token_accuracy": 0.659719705581665, + "num_tokens": 196233034.0, + "step": 474 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6614966988563538, + "learning_rate": 0.0001, + "loss": 1.2407, + "mean_token_accuracy": 0.6485875844955444, + "num_tokens": 196660034.0, + "step": 475 + }, + { + "epoch": 0.8350877192982457, + "grad_norm": 0.5896729826927185, + "learning_rate": 0.0001, + "loss": 1.2188, + "mean_token_accuracy": 0.6551612615585327, + "num_tokens": 197080673.0, + "step": 476 + }, + { + "epoch": 0.8368421052631579, + "grad_norm": 0.6428948044776917, + "learning_rate": 0.0001, + "loss": 1.1962, + "mean_token_accuracy": 0.6604630947113037, + "num_tokens": 197493003.0, + "step": 477 + }, + { + "epoch": 0.8385964912280702, + "grad_norm": 0.6853853464126587, + "learning_rate": 0.0001, + "loss": 1.2116, + "mean_token_accuracy": 0.6553216576576233, + "num_tokens": 197923635.0, + "step": 478 + }, + { + "epoch": 0.8403508771929824, + "grad_norm": 0.6877092719078064, + "learning_rate": 0.0001, + "loss": 1.2017, + "mean_token_accuracy": 0.657435417175293, + "num_tokens": 198339854.0, + "step": 479 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.5886791348457336, + "learning_rate": 0.0001, + "loss": 1.2414, + "mean_token_accuracy": 0.6491390466690063, + "num_tokens": 198768483.0, + "step": 480 + }, + { + "epoch": 0.843859649122807, + "grad_norm": 0.8585889935493469, + "learning_rate": 0.0001, + "loss": 1.2168, + "mean_token_accuracy": 0.6541799902915955, + "num_tokens": 199190842.0, + "step": 481 + }, + { + "epoch": 0.8456140350877193, + "grad_norm": 0.6527767181396484, + "learning_rate": 0.0001, + "loss": 1.2236, + "mean_token_accuracy": 0.6539558172225952, + "num_tokens": 199620649.0, + "step": 482 + }, + { + "epoch": 0.8473684210526315, + "grad_norm": 0.6834746599197388, + "learning_rate": 0.0001, + "loss": 1.2015, + "mean_token_accuracy": 0.6586301326751709, + "num_tokens": 200023406.0, + "step": 483 + }, + { + "epoch": 0.8491228070175438, + "grad_norm": 0.6827247142791748, + "learning_rate": 0.0001, + "loss": 1.178, + "mean_token_accuracy": 0.6644470691680908, + "num_tokens": 200430239.0, + "step": 484 + }, + { + "epoch": 0.8508771929824561, + "grad_norm": 0.6491426229476929, + "learning_rate": 0.0001, + "loss": 1.1927, + "mean_token_accuracy": 0.6621779799461365, + "num_tokens": 200833864.0, + "step": 485 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 0.6229031682014465, + "learning_rate": 0.0001, + "loss": 1.1923, + "mean_token_accuracy": 0.6596782207489014, + "num_tokens": 201241869.0, + "step": 486 + }, + { + "epoch": 0.8543859649122807, + "grad_norm": 0.5779981017112732, + "learning_rate": 0.0001, + "loss": 1.1857, + "mean_token_accuracy": 0.6614409685134888, + "num_tokens": 201658653.0, + "step": 487 + }, + { + "epoch": 0.856140350877193, + "grad_norm": 0.6096077561378479, + "learning_rate": 0.0001, + "loss": 1.2096, + "mean_token_accuracy": 0.6574957370758057, + "num_tokens": 202086503.0, + "step": 488 + }, + { + "epoch": 0.8578947368421053, + "grad_norm": 0.7495996952056885, + "learning_rate": 0.0001, + "loss": 1.2005, + "mean_token_accuracy": 0.6597346067428589, + "num_tokens": 202509306.0, + "step": 489 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 0.6209189295768738, + "learning_rate": 0.0001, + "loss": 1.1753, + "mean_token_accuracy": 0.6628221273422241, + "num_tokens": 202909087.0, + "step": 490 + }, + { + "epoch": 0.8614035087719298, + "grad_norm": 0.563208281993866, + "learning_rate": 0.0001, + "loss": 1.2005, + "mean_token_accuracy": 0.6574559211730957, + "num_tokens": 203337615.0, + "step": 491 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 0.6872074604034424, + "learning_rate": 0.0001, + "loss": 1.1982, + "mean_token_accuracy": 0.6597882509231567, + "num_tokens": 203754527.0, + "step": 492 + }, + { + "epoch": 0.8649122807017544, + "grad_norm": 0.6505935192108154, + "learning_rate": 0.0001, + "loss": 1.1734, + "mean_token_accuracy": 0.666144609451294, + "num_tokens": 204166768.0, + "step": 493 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.7290279269218445, + "learning_rate": 0.0001, + "loss": 1.1923, + "mean_token_accuracy": 0.6601245403289795, + "num_tokens": 204554076.0, + "step": 494 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.6451328992843628, + "learning_rate": 0.0001, + "loss": 1.2272, + "mean_token_accuracy": 0.653006911277771, + "num_tokens": 204962578.0, + "step": 495 + }, + { + "epoch": 0.8701754385964913, + "grad_norm": 0.7413347363471985, + "learning_rate": 0.0001, + "loss": 1.153, + "mean_token_accuracy": 0.6694173216819763, + "num_tokens": 205358271.0, + "step": 496 + }, + { + "epoch": 0.8719298245614036, + "grad_norm": 0.6787010431289673, + "learning_rate": 0.0001, + "loss": 1.2085, + "mean_token_accuracy": 0.6561261415481567, + "num_tokens": 205786890.0, + "step": 497 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 0.6698117256164551, + "learning_rate": 0.0001, + "loss": 1.2019, + "mean_token_accuracy": 0.6592704057693481, + "num_tokens": 206193572.0, + "step": 498 + }, + { + "epoch": 0.875438596491228, + "grad_norm": 0.6170295476913452, + "learning_rate": 0.0001, + "loss": 1.1723, + "mean_token_accuracy": 0.6650887727737427, + "num_tokens": 206607450.0, + "step": 499 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 0.5921252965927124, + "learning_rate": 0.0001, + "loss": 1.1823, + "mean_token_accuracy": 0.6633787155151367, + "num_tokens": 207005126.0, + "step": 500 + }, + { + "epoch": 0.8789473684210526, + "grad_norm": 0.69658362865448, + "learning_rate": 0.0001, + "loss": 1.1793, + "mean_token_accuracy": 0.6646496057510376, + "num_tokens": 207394794.0, + "step": 501 + }, + { + "epoch": 0.8807017543859649, + "grad_norm": 0.6810624599456787, + "learning_rate": 0.0001, + "loss": 1.1979, + "mean_token_accuracy": 0.6584210395812988, + "num_tokens": 207783539.0, + "step": 502 + }, + { + "epoch": 0.8824561403508772, + "grad_norm": 0.6264888644218445, + "learning_rate": 0.0001, + "loss": 1.2045, + "mean_token_accuracy": 0.6583248376846313, + "num_tokens": 208196198.0, + "step": 503 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 0.6070482730865479, + "learning_rate": 0.0001, + "loss": 1.1995, + "mean_token_accuracy": 0.6601771712303162, + "num_tokens": 208602598.0, + "step": 504 + }, + { + "epoch": 0.8859649122807017, + "grad_norm": 0.6856517791748047, + "learning_rate": 0.0001, + "loss": 1.1909, + "mean_token_accuracy": 0.6614357233047485, + "num_tokens": 209039139.0, + "step": 505 + }, + { + "epoch": 0.887719298245614, + "grad_norm": 0.5697737336158752, + "learning_rate": 0.0001, + "loss": 1.1967, + "mean_token_accuracy": 0.6589823961257935, + "num_tokens": 209437950.0, + "step": 506 + }, + { + "epoch": 0.8894736842105263, + "grad_norm": 0.7310987114906311, + "learning_rate": 0.0001, + "loss": 1.205, + "mean_token_accuracy": 0.6580671668052673, + "num_tokens": 209862559.0, + "step": 507 + }, + { + "epoch": 0.8912280701754386, + "grad_norm": 0.6229117512702942, + "learning_rate": 0.0001, + "loss": 1.222, + "mean_token_accuracy": 0.6536027789115906, + "num_tokens": 210284769.0, + "step": 508 + }, + { + "epoch": 0.8929824561403509, + "grad_norm": 0.5739285349845886, + "learning_rate": 0.0001, + "loss": 1.2059, + "mean_token_accuracy": 0.6585639119148254, + "num_tokens": 210708859.0, + "step": 509 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.6239802837371826, + "learning_rate": 0.0001, + "loss": 1.1986, + "mean_token_accuracy": 0.6589633822441101, + "num_tokens": 211130168.0, + "step": 510 + }, + { + "epoch": 0.8964912280701754, + "grad_norm": 0.6617391705513, + "learning_rate": 0.0001, + "loss": 1.2027, + "mean_token_accuracy": 0.6577974557876587, + "num_tokens": 211529753.0, + "step": 511 + }, + { + "epoch": 0.8982456140350877, + "grad_norm": 0.638733983039856, + "learning_rate": 0.0001, + "loss": 1.2142, + "mean_token_accuracy": 0.6540495157241821, + "num_tokens": 211963430.0, + "step": 512 + }, + { + "epoch": 0.9, + "grad_norm": 0.6008735299110413, + "learning_rate": 0.0001, + "loss": 1.2136, + "mean_token_accuracy": 0.6559524536132812, + "num_tokens": 212387404.0, + "step": 513 + }, + { + "epoch": 0.9017543859649123, + "grad_norm": 0.6343475580215454, + "learning_rate": 0.0001, + "loss": 1.1718, + "mean_token_accuracy": 0.6651860475540161, + "num_tokens": 212802763.0, + "step": 514 + }, + { + "epoch": 0.9035087719298246, + "grad_norm": 0.637675940990448, + "learning_rate": 0.0001, + "loss": 1.1694, + "mean_token_accuracy": 0.6661834716796875, + "num_tokens": 213219406.0, + "step": 515 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 0.5518184900283813, + "learning_rate": 0.0001, + "loss": 1.1954, + "mean_token_accuracy": 0.6603313684463501, + "num_tokens": 213623710.0, + "step": 516 + }, + { + "epoch": 0.9070175438596492, + "grad_norm": 0.6756175756454468, + "learning_rate": 0.0001, + "loss": 1.1701, + "mean_token_accuracy": 0.6667043566703796, + "num_tokens": 214053806.0, + "step": 517 + }, + { + "epoch": 0.9087719298245615, + "grad_norm": 0.5964516401290894, + "learning_rate": 0.0001, + "loss": 1.2007, + "mean_token_accuracy": 0.6573567390441895, + "num_tokens": 214457394.0, + "step": 518 + }, + { + "epoch": 0.9105263157894737, + "grad_norm": 0.745707094669342, + "learning_rate": 0.0001, + "loss": 1.1697, + "mean_token_accuracy": 0.6656190156936646, + "num_tokens": 214841472.0, + "step": 519 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 0.5971705317497253, + "learning_rate": 0.0001, + "loss": 1.2061, + "mean_token_accuracy": 0.656207799911499, + "num_tokens": 215261046.0, + "step": 520 + }, + { + "epoch": 0.9140350877192982, + "grad_norm": 0.7177700400352478, + "learning_rate": 0.0001, + "loss": 1.1753, + "mean_token_accuracy": 0.6650264263153076, + "num_tokens": 215664423.0, + "step": 521 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 0.5945612788200378, + "learning_rate": 0.0001, + "loss": 1.1813, + "mean_token_accuracy": 0.66515052318573, + "num_tokens": 216072733.0, + "step": 522 + }, + { + "epoch": 0.9175438596491228, + "grad_norm": 0.7161288857460022, + "learning_rate": 0.0001, + "loss": 1.1964, + "mean_token_accuracy": 0.6598995923995972, + "num_tokens": 216490157.0, + "step": 523 + }, + { + "epoch": 0.9192982456140351, + "grad_norm": 0.6490321159362793, + "learning_rate": 0.0001, + "loss": 1.2151, + "mean_token_accuracy": 0.6550993919372559, + "num_tokens": 216933714.0, + "step": 524 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.6328585743904114, + "learning_rate": 0.0001, + "loss": 1.2008, + "mean_token_accuracy": 0.658663272857666, + "num_tokens": 217360826.0, + "step": 525 + }, + { + "epoch": 0.9228070175438596, + "grad_norm": 0.6045000553131104, + "learning_rate": 0.0001, + "loss": 1.1837, + "mean_token_accuracy": 0.6624784469604492, + "num_tokens": 217767964.0, + "step": 526 + }, + { + "epoch": 0.9245614035087719, + "grad_norm": 0.5896552205085754, + "learning_rate": 0.0001, + "loss": 1.1785, + "mean_token_accuracy": 0.663489043712616, + "num_tokens": 218182502.0, + "step": 527 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 0.6433465480804443, + "learning_rate": 0.0001, + "loss": 1.1866, + "mean_token_accuracy": 0.662973940372467, + "num_tokens": 218599164.0, + "step": 528 + }, + { + "epoch": 0.9280701754385965, + "grad_norm": 0.6225712895393372, + "learning_rate": 0.0001, + "loss": 1.1732, + "mean_token_accuracy": 0.6657634973526001, + "num_tokens": 219038853.0, + "step": 529 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 0.6584674715995789, + "learning_rate": 0.0001, + "loss": 1.1931, + "mean_token_accuracy": 0.6607257127761841, + "num_tokens": 219455763.0, + "step": 530 + }, + { + "epoch": 0.9315789473684211, + "grad_norm": 0.5859020352363586, + "learning_rate": 0.0001, + "loss": 1.2043, + "mean_token_accuracy": 0.657474935054779, + "num_tokens": 219857096.0, + "step": 531 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.6879558563232422, + "learning_rate": 0.0001, + "loss": 1.1761, + "mean_token_accuracy": 0.6635361909866333, + "num_tokens": 220269043.0, + "step": 532 + }, + { + "epoch": 0.9350877192982456, + "grad_norm": 0.6866979002952576, + "learning_rate": 0.0001, + "loss": 1.2073, + "mean_token_accuracy": 0.6566738486289978, + "num_tokens": 220650778.0, + "step": 533 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 0.6336923241615295, + "learning_rate": 0.0001, + "loss": 1.2039, + "mean_token_accuracy": 0.6579070091247559, + "num_tokens": 221055825.0, + "step": 534 + }, + { + "epoch": 0.9385964912280702, + "grad_norm": 0.6081579327583313, + "learning_rate": 0.0001, + "loss": 1.2108, + "mean_token_accuracy": 0.6550259590148926, + "num_tokens": 221459868.0, + "step": 535 + }, + { + "epoch": 0.9403508771929825, + "grad_norm": 0.6312009692192078, + "learning_rate": 0.0001, + "loss": 1.1702, + "mean_token_accuracy": 0.6641944646835327, + "num_tokens": 221860324.0, + "step": 536 + }, + { + "epoch": 0.9421052631578948, + "grad_norm": 0.5887439250946045, + "learning_rate": 0.0001, + "loss": 1.1778, + "mean_token_accuracy": 0.6638119220733643, + "num_tokens": 222291251.0, + "step": 537 + }, + { + "epoch": 0.9438596491228071, + "grad_norm": 0.543400764465332, + "learning_rate": 0.0001, + "loss": 1.1805, + "mean_token_accuracy": 0.6627988815307617, + "num_tokens": 222699559.0, + "step": 538 + }, + { + "epoch": 0.9456140350877194, + "grad_norm": 0.5787383913993835, + "learning_rate": 0.0001, + "loss": 1.1914, + "mean_token_accuracy": 0.6620433330535889, + "num_tokens": 223156357.0, + "step": 539 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.6597963571548462, + "learning_rate": 0.0001, + "loss": 1.1871, + "mean_token_accuracy": 0.6626171469688416, + "num_tokens": 223562411.0, + "step": 540 + }, + { + "epoch": 0.9491228070175438, + "grad_norm": 0.5731210112571716, + "learning_rate": 0.0001, + "loss": 1.2078, + "mean_token_accuracy": 0.6560046076774597, + "num_tokens": 223988252.0, + "step": 541 + }, + { + "epoch": 0.9508771929824561, + "grad_norm": 0.7036701440811157, + "learning_rate": 0.0001, + "loss": 1.1917, + "mean_token_accuracy": 0.6613459587097168, + "num_tokens": 224401217.0, + "step": 542 + }, + { + "epoch": 0.9526315789473684, + "grad_norm": 0.5783252120018005, + "learning_rate": 0.0001, + "loss": 1.1757, + "mean_token_accuracy": 0.6623378396034241, + "num_tokens": 224807482.0, + "step": 543 + }, + { + "epoch": 0.9543859649122807, + "grad_norm": 0.7617517113685608, + "learning_rate": 0.0001, + "loss": 1.1937, + "mean_token_accuracy": 0.659681499004364, + "num_tokens": 225220680.0, + "step": 544 + }, + { + "epoch": 0.956140350877193, + "grad_norm": 0.6007680296897888, + "learning_rate": 0.0001, + "loss": 1.2401, + "mean_token_accuracy": 0.6486543416976929, + "num_tokens": 225640539.0, + "step": 545 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 0.7272628545761108, + "learning_rate": 0.0001, + "loss": 1.1822, + "mean_token_accuracy": 0.6626489162445068, + "num_tokens": 226035924.0, + "step": 546 + }, + { + "epoch": 0.9596491228070175, + "grad_norm": 0.700038492679596, + "learning_rate": 0.0001, + "loss": 1.1658, + "mean_token_accuracy": 0.666049599647522, + "num_tokens": 226425232.0, + "step": 547 + }, + { + "epoch": 0.9614035087719298, + "grad_norm": 0.6490049958229065, + "learning_rate": 0.0001, + "loss": 1.1691, + "mean_token_accuracy": 0.6649153828620911, + "num_tokens": 226802383.0, + "step": 548 + }, + { + "epoch": 0.9631578947368421, + "grad_norm": 0.7154028415679932, + "learning_rate": 0.0001, + "loss": 1.1986, + "mean_token_accuracy": 0.6584769487380981, + "num_tokens": 227227610.0, + "step": 549 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 0.6601865887641907, + "learning_rate": 0.0001, + "loss": 1.2034, + "mean_token_accuracy": 0.6581115126609802, + "num_tokens": 227657882.0, + "step": 550 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 0.6211066842079163, + "learning_rate": 0.0001, + "loss": 1.1749, + "mean_token_accuracy": 0.6637754440307617, + "num_tokens": 228081460.0, + "step": 551 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 0.6879007816314697, + "learning_rate": 0.0001, + "loss": 1.2181, + "mean_token_accuracy": 0.6532254219055176, + "num_tokens": 228507388.0, + "step": 552 + }, + { + "epoch": 0.9701754385964912, + "grad_norm": 0.6297675371170044, + "learning_rate": 0.0001, + "loss": 1.2126, + "mean_token_accuracy": 0.654970645904541, + "num_tokens": 228935033.0, + "step": 553 + }, + { + "epoch": 0.9719298245614035, + "grad_norm": 0.5917762517929077, + "learning_rate": 0.0001, + "loss": 1.1517, + "mean_token_accuracy": 0.6693041920661926, + "num_tokens": 229325690.0, + "step": 554 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.6466293334960938, + "learning_rate": 0.0001, + "loss": 1.1573, + "mean_token_accuracy": 0.6677490472793579, + "num_tokens": 229726549.0, + "step": 555 + }, + { + "epoch": 0.9754385964912281, + "grad_norm": 0.6341378688812256, + "learning_rate": 0.0001, + "loss": 1.185, + "mean_token_accuracy": 0.6614177227020264, + "num_tokens": 230122274.0, + "step": 556 + }, + { + "epoch": 0.9771929824561404, + "grad_norm": 0.604850172996521, + "learning_rate": 0.0001, + "loss": 1.1959, + "mean_token_accuracy": 0.6580736637115479, + "num_tokens": 230526104.0, + "step": 557 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 0.7436766624450684, + "learning_rate": 0.0001, + "loss": 1.1642, + "mean_token_accuracy": 0.6674508452415466, + "num_tokens": 230941199.0, + "step": 558 + }, + { + "epoch": 0.980701754385965, + "grad_norm": 0.6362001895904541, + "learning_rate": 0.0001, + "loss": 1.1751, + "mean_token_accuracy": 0.6646236181259155, + "num_tokens": 231367872.0, + "step": 559 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 0.6686745882034302, + "learning_rate": 0.0001, + "loss": 1.2065, + "mean_token_accuracy": 0.6581393480300903, + "num_tokens": 231804511.0, + "step": 560 + }, + { + "epoch": 0.9842105263157894, + "grad_norm": 0.7186607718467712, + "learning_rate": 0.0001, + "loss": 1.1692, + "mean_token_accuracy": 0.6657729148864746, + "num_tokens": 232201554.0, + "step": 561 + }, + { + "epoch": 0.9859649122807017, + "grad_norm": 0.5875235795974731, + "learning_rate": 0.0001, + "loss": 1.185, + "mean_token_accuracy": 0.6623810529708862, + "num_tokens": 232632247.0, + "step": 562 + }, + { + "epoch": 0.987719298245614, + "grad_norm": 0.6285355687141418, + "learning_rate": 0.0001, + "loss": 1.1932, + "mean_token_accuracy": 0.6584882140159607, + "num_tokens": 233040299.0, + "step": 563 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 0.6787013411521912, + "learning_rate": 0.0001, + "loss": 1.2017, + "mean_token_accuracy": 0.6589356660842896, + "num_tokens": 233476898.0, + "step": 564 + }, + { + "epoch": 0.9912280701754386, + "grad_norm": 0.5261335372924805, + "learning_rate": 0.0001, + "loss": 1.1674, + "mean_token_accuracy": 0.6651347279548645, + "num_tokens": 233898203.0, + "step": 565 + }, + { + "epoch": 0.9929824561403509, + "grad_norm": 0.6217278242111206, + "learning_rate": 0.0001, + "loss": 1.2212, + "mean_token_accuracy": 0.653939962387085, + "num_tokens": 234316491.0, + "step": 566 + }, + { + "epoch": 0.9947368421052631, + "grad_norm": 0.6469559073448181, + "learning_rate": 0.0001, + "loss": 1.1848, + "mean_token_accuracy": 0.6615195870399475, + "num_tokens": 234725455.0, + "step": 567 + }, + { + "epoch": 0.9964912280701754, + "grad_norm": 0.6558631062507629, + "learning_rate": 0.0001, + "loss": 1.173, + "mean_token_accuracy": 0.6650323867797852, + "num_tokens": 235152094.0, + "step": 568 + }, + { + "epoch": 0.9982456140350877, + "grad_norm": 0.6159579157829285, + "learning_rate": 0.0001, + "loss": 1.1885, + "mean_token_accuracy": 0.6604526042938232, + "num_tokens": 235558911.0, + "step": 569 + }, + { + "epoch": 1.0, + "grad_norm": 0.6799984574317932, + "learning_rate": 0.0001, + "loss": 1.1975, + "mean_token_accuracy": 0.6584136486053467, + "num_tokens": 235994347.0, + "step": 570 + }, + { + "epoch": 1.0, + "step": 570, + "total_flos": 1.377941890090926e+18, + "train_loss": 1.2564991597543682, + "train_runtime": 1307.2842, + "train_samples_per_second": 111.621, + "train_steps_per_second": 0.436 + } + ], + "logging_steps": 1, + "max_steps": 570, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 285, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.377941890090926e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..589b18a --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f643d1663b3cc33737006e06f0346b1d97fbdeed19118d07249ee4abd5aae4d6 +size 13329