commit 6f263bcc328ccfa621288682bb2c61479be0d476 Author: ModelHub XC Date: Mon Jun 1 02:21:15 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.2-1B-Instruct_SFT_sciencev00.04 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..3f3cc3d --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +datasets: Neelectric/MoT_science_Llama3_2048toks +library_name: transformers +model_name: Llama-3.2-1B-Instruct_SFT_sciencev00.04 +tags: +- generated_from_trainer +- trl +- sft +- open-r1 +licence: license +--- + +# Model Card for Llama-3.2-1B-Instruct_SFT_sciencev00.04 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the [Neelectric/MoT_science_Llama3_2048toks](https://huggingface.co/datasets/Neelectric/MoT_science_Llama3_2048toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.2-1B-Instruct_SFT_sciencev00.04", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_science/runs/25xtib34) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 1.0.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.3 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..72386af --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.1853937174671524e+18, + "train_loss": 1.1615400253780304, + "train_runtime": 1840.1375, + "train_samples": 107517, + "train_samples_per_second": 175.286, + "train_steps_per_second": 0.342 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..9912fbf --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..07b8c43 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a7a6faf606af54469fcd7e9aebb7dc4b8a3c2b1039c9bad05d4238bc5b7033e +size 2996982344 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..72386af --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.1853937174671524e+18, + "train_loss": 1.1615400253780304, + "train_runtime": 1840.1375, + "train_samples": 107517, + "train_samples_per_second": 175.286, + "train_steps_per_second": 0.342 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..44b7814 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,5713 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 630, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004761904761904762, + "grad_norm": 8.288029670715332, + "learning_rate": 0.0, + "loss": 1.7656, + "mean_token_accuracy": 0.5768666863441467, + "num_tokens": 582781.0, + "step": 1 + }, + { + "epoch": 0.009523809523809525, + "grad_norm": 8.260492324829102, + "learning_rate": 1.5873015873015874e-07, + "loss": 1.7728, + "mean_token_accuracy": 0.5752322673797607, + "num_tokens": 1163696.0, + "step": 2 + }, + { + "epoch": 0.014285714285714285, + "grad_norm": 8.188252449035645, + "learning_rate": 3.174603174603175e-07, + "loss": 1.776, + "mean_token_accuracy": 0.5746057033538818, + "num_tokens": 1762000.0, + "step": 3 + }, + { + "epoch": 0.01904761904761905, + "grad_norm": 8.122298240661621, + "learning_rate": 4.7619047619047623e-07, + "loss": 1.7765, + "mean_token_accuracy": 0.5741599798202515, + "num_tokens": 2363228.0, + "step": 4 + }, + { + "epoch": 0.023809523809523808, + "grad_norm": 7.91809606552124, + "learning_rate": 6.34920634920635e-07, + "loss": 1.7924, + "mean_token_accuracy": 0.5723700523376465, + "num_tokens": 2968748.0, + "step": 5 + }, + { + "epoch": 0.02857142857142857, + "grad_norm": 7.924537181854248, + "learning_rate": 7.936507936507937e-07, + "loss": 1.7649, + "mean_token_accuracy": 0.5754636526107788, + "num_tokens": 3564062.0, + "step": 6 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 7.629780292510986, + "learning_rate": 9.523809523809525e-07, + "loss": 1.7769, + "mean_token_accuracy": 0.5719509124755859, + "num_tokens": 4140352.0, + "step": 7 + }, + { + "epoch": 0.0380952380952381, + "grad_norm": 7.133674621582031, + "learning_rate": 1.111111111111111e-06, + "loss": 1.7748, + "mean_token_accuracy": 0.5719484090805054, + "num_tokens": 4748067.0, + "step": 8 + }, + { + "epoch": 0.04285714285714286, + "grad_norm": 6.150221347808838, + "learning_rate": 1.26984126984127e-06, + "loss": 1.7288, + "mean_token_accuracy": 0.5776432156562805, + "num_tokens": 5333791.0, + "step": 9 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 6.026834964752197, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.7401, + "mean_token_accuracy": 0.5752577781677246, + "num_tokens": 5923564.0, + "step": 10 + }, + { + "epoch": 0.05238095238095238, + "grad_norm": 5.608363151550293, + "learning_rate": 1.5873015873015873e-06, + "loss": 1.7097, + "mean_token_accuracy": 0.5797863602638245, + "num_tokens": 6528559.0, + "step": 11 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 4.234569072723389, + "learning_rate": 1.746031746031746e-06, + "loss": 1.6598, + "mean_token_accuracy": 0.5850973725318909, + "num_tokens": 7118765.0, + "step": 12 + }, + { + "epoch": 0.06190476190476191, + "grad_norm": 4.145053386688232, + "learning_rate": 1.904761904761905e-06, + "loss": 1.6597, + "mean_token_accuracy": 0.5842898488044739, + "num_tokens": 7709226.0, + "step": 13 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.9073646068573, + "learning_rate": 2.0634920634920634e-06, + "loss": 1.6303, + "mean_token_accuracy": 0.5906457901000977, + "num_tokens": 8298984.0, + "step": 14 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.8127150535583496, + "learning_rate": 2.222222222222222e-06, + "loss": 1.6281, + "mean_token_accuracy": 0.5896565914154053, + "num_tokens": 8875624.0, + "step": 15 + }, + { + "epoch": 0.0761904761904762, + "grad_norm": 3.0899341106414795, + "learning_rate": 2.380952380952381e-06, + "loss": 1.5687, + "mean_token_accuracy": 0.5990549325942993, + "num_tokens": 9448671.0, + "step": 16 + }, + { + "epoch": 0.08095238095238096, + "grad_norm": 2.755232334136963, + "learning_rate": 2.53968253968254e-06, + "loss": 1.5548, + "mean_token_accuracy": 0.6021129488945007, + "num_tokens": 10049546.0, + "step": 17 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 2.589613914489746, + "learning_rate": 2.6984126984126986e-06, + "loss": 1.5609, + "mean_token_accuracy": 0.5993459820747375, + "num_tokens": 10644905.0, + "step": 18 + }, + { + "epoch": 0.09047619047619047, + "grad_norm": 2.2161478996276855, + "learning_rate": 2.8571428571428573e-06, + "loss": 1.5541, + "mean_token_accuracy": 0.6018418073654175, + "num_tokens": 11239583.0, + "step": 19 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 1.9722470045089722, + "learning_rate": 3.015873015873016e-06, + "loss": 1.5295, + "mean_token_accuracy": 0.6070071458816528, + "num_tokens": 11827320.0, + "step": 20 + }, + { + "epoch": 0.1, + "grad_norm": 1.8827704191207886, + "learning_rate": 3.1746031746031746e-06, + "loss": 1.4814, + "mean_token_accuracy": 0.6151003837585449, + "num_tokens": 12425511.0, + "step": 21 + }, + { + "epoch": 0.10476190476190476, + "grad_norm": 2.351033926010132, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.4865, + "mean_token_accuracy": 0.6138286590576172, + "num_tokens": 13015708.0, + "step": 22 + }, + { + "epoch": 0.10952380952380952, + "grad_norm": 2.134150981903076, + "learning_rate": 3.492063492063492e-06, + "loss": 1.469, + "mean_token_accuracy": 0.6165286302566528, + "num_tokens": 13608875.0, + "step": 23 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 1.9380258321762085, + "learning_rate": 3.6507936507936507e-06, + "loss": 1.476, + "mean_token_accuracy": 0.6141604781150818, + "num_tokens": 14204297.0, + "step": 24 + }, + { + "epoch": 0.11904761904761904, + "grad_norm": 1.656062364578247, + "learning_rate": 3.80952380952381e-06, + "loss": 1.461, + "mean_token_accuracy": 0.6166412830352783, + "num_tokens": 14782206.0, + "step": 25 + }, + { + "epoch": 0.12380952380952381, + "grad_norm": 1.3905470371246338, + "learning_rate": 3.968253968253968e-06, + "loss": 1.4382, + "mean_token_accuracy": 0.6210923194885254, + "num_tokens": 15377829.0, + "step": 26 + }, + { + "epoch": 0.12857142857142856, + "grad_norm": 1.1439160108566284, + "learning_rate": 4.126984126984127e-06, + "loss": 1.4318, + "mean_token_accuracy": 0.6224101781845093, + "num_tokens": 15975819.0, + "step": 27 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.0443707704544067, + "learning_rate": 4.2857142857142855e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.6251046061515808, + "num_tokens": 16577839.0, + "step": 28 + }, + { + "epoch": 0.1380952380952381, + "grad_norm": 1.0729820728302002, + "learning_rate": 4.444444444444444e-06, + "loss": 1.4116, + "mean_token_accuracy": 0.6257858276367188, + "num_tokens": 17164831.0, + "step": 29 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.1262085437774658, + "learning_rate": 4.603174603174604e-06, + "loss": 1.3974, + "mean_token_accuracy": 0.6290417909622192, + "num_tokens": 17770476.0, + "step": 30 + }, + { + "epoch": 0.14761904761904762, + "grad_norm": 1.1004436016082764, + "learning_rate": 4.761904761904762e-06, + "loss": 1.383, + "mean_token_accuracy": 0.6305603981018066, + "num_tokens": 18360862.0, + "step": 31 + }, + { + "epoch": 0.1523809523809524, + "grad_norm": 0.9822593927383423, + "learning_rate": 4.920634920634921e-06, + "loss": 1.3981, + "mean_token_accuracy": 0.6271172761917114, + "num_tokens": 18944338.0, + "step": 32 + }, + { + "epoch": 0.15714285714285714, + "grad_norm": 0.8572197556495667, + "learning_rate": 5.07936507936508e-06, + "loss": 1.3721, + "mean_token_accuracy": 0.6327400207519531, + "num_tokens": 19540189.0, + "step": 33 + }, + { + "epoch": 0.1619047619047619, + "grad_norm": 0.9113824963569641, + "learning_rate": 5.2380952380952384e-06, + "loss": 1.3689, + "mean_token_accuracy": 0.6341559290885925, + "num_tokens": 20138131.0, + "step": 34 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.8736249208450317, + "learning_rate": 5.396825396825397e-06, + "loss": 1.3855, + "mean_token_accuracy": 0.6294394731521606, + "num_tokens": 20735187.0, + "step": 35 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 0.8438997268676758, + "learning_rate": 5.555555555555557e-06, + "loss": 1.3614, + "mean_token_accuracy": 0.6335337162017822, + "num_tokens": 21316383.0, + "step": 36 + }, + { + "epoch": 0.1761904761904762, + "grad_norm": 0.7541394233703613, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.6401833295822144, + "num_tokens": 21910626.0, + "step": 37 + }, + { + "epoch": 0.18095238095238095, + "grad_norm": 0.697533130645752, + "learning_rate": 5.873015873015874e-06, + "loss": 1.3591, + "mean_token_accuracy": 0.6341187357902527, + "num_tokens": 22503955.0, + "step": 38 + }, + { + "epoch": 0.18571428571428572, + "grad_norm": 0.677990734577179, + "learning_rate": 6.031746031746032e-06, + "loss": 1.3543, + "mean_token_accuracy": 0.6353764533996582, + "num_tokens": 23093310.0, + "step": 39 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.677953839302063, + "learning_rate": 6.1904761904761914e-06, + "loss": 1.3249, + "mean_token_accuracy": 0.641827404499054, + "num_tokens": 23681028.0, + "step": 40 + }, + { + "epoch": 0.19523809523809524, + "grad_norm": 0.6177698969841003, + "learning_rate": 6.349206349206349e-06, + "loss": 1.3271, + "mean_token_accuracy": 0.6412782669067383, + "num_tokens": 24275532.0, + "step": 41 + }, + { + "epoch": 0.2, + "grad_norm": 0.6382781267166138, + "learning_rate": 6.507936507936509e-06, + "loss": 1.3309, + "mean_token_accuracy": 0.6407559514045715, + "num_tokens": 24868054.0, + "step": 42 + }, + { + "epoch": 0.20476190476190476, + "grad_norm": 0.5981337428092957, + "learning_rate": 6.666666666666667e-06, + "loss": 1.3323, + "mean_token_accuracy": 0.6397281885147095, + "num_tokens": 25459842.0, + "step": 43 + }, + { + "epoch": 0.20952380952380953, + "grad_norm": 0.5885143876075745, + "learning_rate": 6.825396825396826e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6373006105422974, + "num_tokens": 26051340.0, + "step": 44 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.5942175984382629, + "learning_rate": 6.984126984126984e-06, + "loss": 1.3188, + "mean_token_accuracy": 0.6426886320114136, + "num_tokens": 26635240.0, + "step": 45 + }, + { + "epoch": 0.21904761904761905, + "grad_norm": 0.6174569129943848, + "learning_rate": 7.1428571428571436e-06, + "loss": 1.3198, + "mean_token_accuracy": 0.6419786214828491, + "num_tokens": 27228570.0, + "step": 46 + }, + { + "epoch": 0.22380952380952382, + "grad_norm": 0.6012991070747375, + "learning_rate": 7.301587301587301e-06, + "loss": 1.3139, + "mean_token_accuracy": 0.6440544128417969, + "num_tokens": 27825958.0, + "step": 47 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.6103922128677368, + "learning_rate": 7.460317460317461e-06, + "loss": 1.3076, + "mean_token_accuracy": 0.6433683037757874, + "num_tokens": 28418470.0, + "step": 48 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.6127147674560547, + "learning_rate": 7.61904761904762e-06, + "loss": 1.3044, + "mean_token_accuracy": 0.6449373364448547, + "num_tokens": 29013060.0, + "step": 49 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.5933082103729248, + "learning_rate": 7.77777777777778e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.6417987942695618, + "num_tokens": 29624709.0, + "step": 50 + }, + { + "epoch": 0.24285714285714285, + "grad_norm": 0.6003814339637756, + "learning_rate": 7.936507936507936e-06, + "loss": 1.3056, + "mean_token_accuracy": 0.6438874006271362, + "num_tokens": 30227928.0, + "step": 51 + }, + { + "epoch": 0.24761904761904763, + "grad_norm": 0.5546218156814575, + "learning_rate": 8.095238095238097e-06, + "loss": 1.3073, + "mean_token_accuracy": 0.6426275968551636, + "num_tokens": 30823383.0, + "step": 52 + }, + { + "epoch": 0.2523809523809524, + "grad_norm": 0.5813356637954712, + "learning_rate": 8.253968253968254e-06, + "loss": 1.2887, + "mean_token_accuracy": 0.6480042338371277, + "num_tokens": 31418593.0, + "step": 53 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 0.6125403046607971, + "learning_rate": 8.412698412698414e-06, + "loss": 1.2812, + "mean_token_accuracy": 0.6492801904678345, + "num_tokens": 32008377.0, + "step": 54 + }, + { + "epoch": 0.2619047619047619, + "grad_norm": 0.6021028757095337, + "learning_rate": 8.571428571428571e-06, + "loss": 1.2881, + "mean_token_accuracy": 0.6466339230537415, + "num_tokens": 32600302.0, + "step": 55 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.5916977524757385, + "learning_rate": 8.730158730158731e-06, + "loss": 1.2896, + "mean_token_accuracy": 0.6466712951660156, + "num_tokens": 33201147.0, + "step": 56 + }, + { + "epoch": 0.2714285714285714, + "grad_norm": 0.5573871731758118, + "learning_rate": 8.888888888888888e-06, + "loss": 1.269, + "mean_token_accuracy": 0.6514161229133606, + "num_tokens": 33790565.0, + "step": 57 + }, + { + "epoch": 0.2761904761904762, + "grad_norm": 0.6427719593048096, + "learning_rate": 9.047619047619049e-06, + "loss": 1.2747, + "mean_token_accuracy": 0.6507048606872559, + "num_tokens": 34387187.0, + "step": 58 + }, + { + "epoch": 0.28095238095238095, + "grad_norm": 0.5992103219032288, + "learning_rate": 9.206349206349207e-06, + "loss": 1.2832, + "mean_token_accuracy": 0.6487317085266113, + "num_tokens": 35000480.0, + "step": 59 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.6176905632019043, + "learning_rate": 9.365079365079366e-06, + "loss": 1.266, + "mean_token_accuracy": 0.6526767611503601, + "num_tokens": 35588577.0, + "step": 60 + }, + { + "epoch": 0.2904761904761905, + "grad_norm": 0.6162196397781372, + "learning_rate": 9.523809523809525e-06, + "loss": 1.2696, + "mean_token_accuracy": 0.6507794857025146, + "num_tokens": 36179186.0, + "step": 61 + }, + { + "epoch": 0.29523809523809524, + "grad_norm": 0.5662937760353088, + "learning_rate": 9.682539682539683e-06, + "loss": 1.2769, + "mean_token_accuracy": 0.6498540639877319, + "num_tokens": 36787338.0, + "step": 62 + }, + { + "epoch": 0.3, + "grad_norm": 0.6263328790664673, + "learning_rate": 9.841269841269842e-06, + "loss": 1.2659, + "mean_token_accuracy": 0.6512309908866882, + "num_tokens": 37376232.0, + "step": 63 + }, + { + "epoch": 0.3047619047619048, + "grad_norm": 0.5712647438049316, + "learning_rate": 1e-05, + "loss": 1.2575, + "mean_token_accuracy": 0.65373295545578, + "num_tokens": 37965066.0, + "step": 64 + }, + { + "epoch": 0.30952380952380953, + "grad_norm": 0.6364603042602539, + "learning_rate": 1e-05, + "loss": 1.2707, + "mean_token_accuracy": 0.6504393219947815, + "num_tokens": 38556474.0, + "step": 65 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 0.5501719117164612, + "learning_rate": 1e-05, + "loss": 1.2817, + "mean_token_accuracy": 0.6485756039619446, + "num_tokens": 39153957.0, + "step": 66 + }, + { + "epoch": 0.319047619047619, + "grad_norm": 0.6252837777137756, + "learning_rate": 1e-05, + "loss": 1.269, + "mean_token_accuracy": 0.6509230136871338, + "num_tokens": 39743079.0, + "step": 67 + }, + { + "epoch": 0.3238095238095238, + "grad_norm": 0.635744035243988, + "learning_rate": 1e-05, + "loss": 1.2538, + "mean_token_accuracy": 0.6549092531204224, + "num_tokens": 40341422.0, + "step": 68 + }, + { + "epoch": 0.32857142857142857, + "grad_norm": 0.602989137172699, + "learning_rate": 1e-05, + "loss": 1.2522, + "mean_token_accuracy": 0.6547552347183228, + "num_tokens": 40930579.0, + "step": 69 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.6224581003189087, + "learning_rate": 1e-05, + "loss": 1.2475, + "mean_token_accuracy": 0.6561790704727173, + "num_tokens": 41521392.0, + "step": 70 + }, + { + "epoch": 0.3380952380952381, + "grad_norm": 0.6388071179389954, + "learning_rate": 1e-05, + "loss": 1.2652, + "mean_token_accuracy": 0.6521209478378296, + "num_tokens": 42126117.0, + "step": 71 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.6036304235458374, + "learning_rate": 1e-05, + "loss": 1.2435, + "mean_token_accuracy": 0.6566687822341919, + "num_tokens": 42717085.0, + "step": 72 + }, + { + "epoch": 0.3476190476190476, + "grad_norm": 0.6735650300979614, + "learning_rate": 1e-05, + "loss": 1.2474, + "mean_token_accuracy": 0.6550711989402771, + "num_tokens": 43300932.0, + "step": 73 + }, + { + "epoch": 0.3523809523809524, + "grad_norm": 0.6821399927139282, + "learning_rate": 1e-05, + "loss": 1.2612, + "mean_token_accuracy": 0.6513347625732422, + "num_tokens": 43885512.0, + "step": 74 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.5906922221183777, + "learning_rate": 1e-05, + "loss": 1.2462, + "mean_token_accuracy": 0.6552602052688599, + "num_tokens": 44482626.0, + "step": 75 + }, + { + "epoch": 0.3619047619047619, + "grad_norm": 0.6703640222549438, + "learning_rate": 1e-05, + "loss": 1.2555, + "mean_token_accuracy": 0.6526749134063721, + "num_tokens": 45073331.0, + "step": 76 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.6432617902755737, + "learning_rate": 1e-05, + "loss": 1.2536, + "mean_token_accuracy": 0.654289186000824, + "num_tokens": 45683001.0, + "step": 77 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.5765655040740967, + "learning_rate": 1e-05, + "loss": 1.2571, + "mean_token_accuracy": 0.6539218425750732, + "num_tokens": 46280871.0, + "step": 78 + }, + { + "epoch": 0.3761904761904762, + "grad_norm": 0.6340111494064331, + "learning_rate": 1e-05, + "loss": 1.2372, + "mean_token_accuracy": 0.6561391353607178, + "num_tokens": 46860927.0, + "step": 79 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.6405033469200134, + "learning_rate": 1e-05, + "loss": 1.2526, + "mean_token_accuracy": 0.6536115407943726, + "num_tokens": 47450747.0, + "step": 80 + }, + { + "epoch": 0.38571428571428573, + "grad_norm": 0.5792959332466125, + "learning_rate": 1e-05, + "loss": 1.25, + "mean_token_accuracy": 0.6553176641464233, + "num_tokens": 48053355.0, + "step": 81 + }, + { + "epoch": 0.3904761904761905, + "grad_norm": 0.686775267124176, + "learning_rate": 1e-05, + "loss": 1.2208, + "mean_token_accuracy": 0.659858226776123, + "num_tokens": 48654406.0, + "step": 82 + }, + { + "epoch": 0.3952380952380952, + "grad_norm": 0.6492419838905334, + "learning_rate": 1e-05, + "loss": 1.2283, + "mean_token_accuracy": 0.6583410501480103, + "num_tokens": 49253902.0, + "step": 83 + }, + { + "epoch": 0.4, + "grad_norm": 0.5871007442474365, + "learning_rate": 1e-05, + "loss": 1.2452, + "mean_token_accuracy": 0.6552358269691467, + "num_tokens": 49851728.0, + "step": 84 + }, + { + "epoch": 0.40476190476190477, + "grad_norm": 0.5860946774482727, + "learning_rate": 1e-05, + "loss": 1.2512, + "mean_token_accuracy": 0.6536369919776917, + "num_tokens": 50456288.0, + "step": 85 + }, + { + "epoch": 0.4095238095238095, + "grad_norm": 0.6220575571060181, + "learning_rate": 1e-05, + "loss": 1.2576, + "mean_token_accuracy": 0.6526967883110046, + "num_tokens": 51058176.0, + "step": 86 + }, + { + "epoch": 0.4142857142857143, + "grad_norm": 0.6111760139465332, + "learning_rate": 1e-05, + "loss": 1.2426, + "mean_token_accuracy": 0.6556516885757446, + "num_tokens": 51665178.0, + "step": 87 + }, + { + "epoch": 0.41904761904761906, + "grad_norm": 0.7028889060020447, + "learning_rate": 1e-05, + "loss": 1.2275, + "mean_token_accuracy": 0.658629298210144, + "num_tokens": 52237427.0, + "step": 88 + }, + { + "epoch": 0.4238095238095238, + "grad_norm": 0.6114148497581482, + "learning_rate": 1e-05, + "loss": 1.2493, + "mean_token_accuracy": 0.6530453562736511, + "num_tokens": 52850605.0, + "step": 89 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.6214424967765808, + "learning_rate": 1e-05, + "loss": 1.2107, + "mean_token_accuracy": 0.6619127988815308, + "num_tokens": 53435907.0, + "step": 90 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.6224313378334045, + "learning_rate": 1e-05, + "loss": 1.2479, + "mean_token_accuracy": 0.6531662344932556, + "num_tokens": 54032690.0, + "step": 91 + }, + { + "epoch": 0.4380952380952381, + "grad_norm": 0.5745725035667419, + "learning_rate": 1e-05, + "loss": 1.2339, + "mean_token_accuracy": 0.6577485799789429, + "num_tokens": 54631908.0, + "step": 92 + }, + { + "epoch": 0.44285714285714284, + "grad_norm": 0.6754887104034424, + "learning_rate": 1e-05, + "loss": 1.2274, + "mean_token_accuracy": 0.6584199666976929, + "num_tokens": 55218598.0, + "step": 93 + }, + { + "epoch": 0.44761904761904764, + "grad_norm": 0.6922246813774109, + "learning_rate": 1e-05, + "loss": 1.2513, + "mean_token_accuracy": 0.6527312397956848, + "num_tokens": 55814642.0, + "step": 94 + }, + { + "epoch": 0.4523809523809524, + "grad_norm": 0.5802931189537048, + "learning_rate": 1e-05, + "loss": 1.2231, + "mean_token_accuracy": 0.6605392694473267, + "num_tokens": 56410000.0, + "step": 95 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.7186371088027954, + "learning_rate": 1e-05, + "loss": 1.2325, + "mean_token_accuracy": 0.6574358940124512, + "num_tokens": 57001902.0, + "step": 96 + }, + { + "epoch": 0.46190476190476193, + "grad_norm": 0.5912067294120789, + "learning_rate": 1e-05, + "loss": 1.2413, + "mean_token_accuracy": 0.6551775336265564, + "num_tokens": 57612227.0, + "step": 97 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.7110946774482727, + "learning_rate": 1e-05, + "loss": 1.2272, + "mean_token_accuracy": 0.6573148965835571, + "num_tokens": 58198983.0, + "step": 98 + }, + { + "epoch": 0.4714285714285714, + "grad_norm": 0.703130841255188, + "learning_rate": 1e-05, + "loss": 1.2488, + "mean_token_accuracy": 0.6536985039710999, + "num_tokens": 58805739.0, + "step": 99 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.6474947333335876, + "learning_rate": 1e-05, + "loss": 1.216, + "mean_token_accuracy": 0.6609683036804199, + "num_tokens": 59386596.0, + "step": 100 + }, + { + "epoch": 0.48095238095238096, + "grad_norm": 0.7493091225624084, + "learning_rate": 1e-05, + "loss": 1.2239, + "mean_token_accuracy": 0.6587037444114685, + "num_tokens": 59976677.0, + "step": 101 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.6101422905921936, + "learning_rate": 1e-05, + "loss": 1.2366, + "mean_token_accuracy": 0.6560448408126831, + "num_tokens": 60581023.0, + "step": 102 + }, + { + "epoch": 0.49047619047619045, + "grad_norm": 0.7304781079292297, + "learning_rate": 1e-05, + "loss": 1.2269, + "mean_token_accuracy": 0.6589258909225464, + "num_tokens": 61177587.0, + "step": 103 + }, + { + "epoch": 0.49523809523809526, + "grad_norm": 0.618215024471283, + "learning_rate": 1e-05, + "loss": 1.2207, + "mean_token_accuracy": 0.6586862802505493, + "num_tokens": 61759739.0, + "step": 104 + }, + { + "epoch": 0.5, + "grad_norm": 0.6789980530738831, + "learning_rate": 1e-05, + "loss": 1.2283, + "mean_token_accuracy": 0.6580797433853149, + "num_tokens": 62343623.0, + "step": 105 + }, + { + "epoch": 0.5047619047619047, + "grad_norm": 0.6834375858306885, + "learning_rate": 1e-05, + "loss": 1.2226, + "mean_token_accuracy": 0.6588083505630493, + "num_tokens": 62936609.0, + "step": 106 + }, + { + "epoch": 0.5095238095238095, + "grad_norm": 0.6128349304199219, + "learning_rate": 1e-05, + "loss": 1.219, + "mean_token_accuracy": 0.6602170467376709, + "num_tokens": 63540035.0, + "step": 107 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 0.6424954533576965, + "learning_rate": 1e-05, + "loss": 1.2252, + "mean_token_accuracy": 0.6583743691444397, + "num_tokens": 64137406.0, + "step": 108 + }, + { + "epoch": 0.5190476190476191, + "grad_norm": 0.566566526889801, + "learning_rate": 1e-05, + "loss": 1.2104, + "mean_token_accuracy": 0.6621809005737305, + "num_tokens": 64747343.0, + "step": 109 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 0.5913292169570923, + "learning_rate": 1e-05, + "loss": 1.21, + "mean_token_accuracy": 0.6611165404319763, + "num_tokens": 65340433.0, + "step": 110 + }, + { + "epoch": 0.5285714285714286, + "grad_norm": 0.5560601353645325, + "learning_rate": 1e-05, + "loss": 1.2029, + "mean_token_accuracy": 0.6629985570907593, + "num_tokens": 65928375.0, + "step": 111 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.5711589455604553, + "learning_rate": 1e-05, + "loss": 1.2285, + "mean_token_accuracy": 0.6574028134346008, + "num_tokens": 66527455.0, + "step": 112 + }, + { + "epoch": 0.5380952380952381, + "grad_norm": 0.5675383806228638, + "learning_rate": 1e-05, + "loss": 1.2001, + "mean_token_accuracy": 0.6645528674125671, + "num_tokens": 67120147.0, + "step": 113 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 0.5860258340835571, + "learning_rate": 1e-05, + "loss": 1.2182, + "mean_token_accuracy": 0.6599565744400024, + "num_tokens": 67726850.0, + "step": 114 + }, + { + "epoch": 0.5476190476190477, + "grad_norm": 0.5209094285964966, + "learning_rate": 1e-05, + "loss": 1.2126, + "mean_token_accuracy": 0.6609143018722534, + "num_tokens": 68316713.0, + "step": 115 + }, + { + "epoch": 0.5523809523809524, + "grad_norm": 0.6333171725273132, + "learning_rate": 1e-05, + "loss": 1.2156, + "mean_token_accuracy": 0.6600525379180908, + "num_tokens": 68892365.0, + "step": 116 + }, + { + "epoch": 0.5571428571428572, + "grad_norm": 0.5704973340034485, + "learning_rate": 1e-05, + "loss": 1.2211, + "mean_token_accuracy": 0.6591875553131104, + "num_tokens": 69505524.0, + "step": 117 + }, + { + "epoch": 0.5619047619047619, + "grad_norm": 0.7181419134140015, + "learning_rate": 1e-05, + "loss": 1.2036, + "mean_token_accuracy": 0.6623135805130005, + "num_tokens": 70095302.0, + "step": 118 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 0.5681948661804199, + "learning_rate": 1e-05, + "loss": 1.216, + "mean_token_accuracy": 0.6598063707351685, + "num_tokens": 70694971.0, + "step": 119 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7001712918281555, + "learning_rate": 1e-05, + "loss": 1.2146, + "mean_token_accuracy": 0.6608985662460327, + "num_tokens": 71279415.0, + "step": 120 + }, + { + "epoch": 0.5761904761904761, + "grad_norm": 0.6377084255218506, + "learning_rate": 1e-05, + "loss": 1.209, + "mean_token_accuracy": 0.6621115207672119, + "num_tokens": 71869014.0, + "step": 121 + }, + { + "epoch": 0.580952380952381, + "grad_norm": 0.6364737153053284, + "learning_rate": 1e-05, + "loss": 1.2171, + "mean_token_accuracy": 0.6591671705245972, + "num_tokens": 72472715.0, + "step": 122 + }, + { + "epoch": 0.5857142857142857, + "grad_norm": 0.6466585397720337, + "learning_rate": 1e-05, + "loss": 1.2089, + "mean_token_accuracy": 0.661442756652832, + "num_tokens": 73055740.0, + "step": 123 + }, + { + "epoch": 0.5904761904761905, + "grad_norm": 0.5920109152793884, + "learning_rate": 1e-05, + "loss": 1.1924, + "mean_token_accuracy": 0.6659133434295654, + "num_tokens": 73639151.0, + "step": 124 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 0.6872738599777222, + "learning_rate": 1e-05, + "loss": 1.2113, + "mean_token_accuracy": 0.6628360152244568, + "num_tokens": 74216756.0, + "step": 125 + }, + { + "epoch": 0.6, + "grad_norm": 0.5881339907646179, + "learning_rate": 1e-05, + "loss": 1.2062, + "mean_token_accuracy": 0.662140965461731, + "num_tokens": 74813953.0, + "step": 126 + }, + { + "epoch": 0.6047619047619047, + "grad_norm": 0.6483287215232849, + "learning_rate": 1e-05, + "loss": 1.2065, + "mean_token_accuracy": 0.6624675989151001, + "num_tokens": 75410691.0, + "step": 127 + }, + { + "epoch": 0.6095238095238096, + "grad_norm": 0.5890834331512451, + "learning_rate": 1e-05, + "loss": 1.2235, + "mean_token_accuracy": 0.6575560569763184, + "num_tokens": 75996496.0, + "step": 128 + }, + { + "epoch": 0.6142857142857143, + "grad_norm": 0.6782101988792419, + "learning_rate": 1e-05, + "loss": 1.199, + "mean_token_accuracy": 0.6648662090301514, + "num_tokens": 76585198.0, + "step": 129 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.6252265572547913, + "learning_rate": 1e-05, + "loss": 1.1872, + "mean_token_accuracy": 0.6665824055671692, + "num_tokens": 77191596.0, + "step": 130 + }, + { + "epoch": 0.6238095238095238, + "grad_norm": 0.6833210587501526, + "learning_rate": 1e-05, + "loss": 1.2048, + "mean_token_accuracy": 0.6622829437255859, + "num_tokens": 77796998.0, + "step": 131 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.6870852708816528, + "learning_rate": 1e-05, + "loss": 1.2171, + "mean_token_accuracy": 0.6590390801429749, + "num_tokens": 78395104.0, + "step": 132 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.7417638897895813, + "learning_rate": 1e-05, + "loss": 1.2036, + "mean_token_accuracy": 0.66297847032547, + "num_tokens": 78988563.0, + "step": 133 + }, + { + "epoch": 0.638095238095238, + "grad_norm": 0.569595456123352, + "learning_rate": 1e-05, + "loss": 1.2234, + "mean_token_accuracy": 0.6573336124420166, + "num_tokens": 79599633.0, + "step": 134 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.8054560422897339, + "learning_rate": 1e-05, + "loss": 1.2149, + "mean_token_accuracy": 0.6601018905639648, + "num_tokens": 80196954.0, + "step": 135 + }, + { + "epoch": 0.6476190476190476, + "grad_norm": 0.6360299587249756, + "learning_rate": 1e-05, + "loss": 1.2141, + "mean_token_accuracy": 0.6599046587944031, + "num_tokens": 80790959.0, + "step": 136 + }, + { + "epoch": 0.6523809523809524, + "grad_norm": 0.7952516078948975, + "learning_rate": 1e-05, + "loss": 1.2004, + "mean_token_accuracy": 0.6641189455986023, + "num_tokens": 81363350.0, + "step": 137 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 0.7050403356552124, + "learning_rate": 1e-05, + "loss": 1.2017, + "mean_token_accuracy": 0.6631975173950195, + "num_tokens": 81960356.0, + "step": 138 + }, + { + "epoch": 0.6619047619047619, + "grad_norm": 0.809806227684021, + "learning_rate": 1e-05, + "loss": 1.2119, + "mean_token_accuracy": 0.6605896353721619, + "num_tokens": 82573967.0, + "step": 139 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.7040579915046692, + "learning_rate": 1e-05, + "loss": 1.1997, + "mean_token_accuracy": 0.6634917259216309, + "num_tokens": 83170751.0, + "step": 140 + }, + { + "epoch": 0.6714285714285714, + "grad_norm": 0.7381901144981384, + "learning_rate": 1e-05, + "loss": 1.1815, + "mean_token_accuracy": 0.6675858497619629, + "num_tokens": 83744022.0, + "step": 141 + }, + { + "epoch": 0.6761904761904762, + "grad_norm": 0.6610327959060669, + "learning_rate": 1e-05, + "loss": 1.2172, + "mean_token_accuracy": 0.659174382686615, + "num_tokens": 84336667.0, + "step": 142 + }, + { + "epoch": 0.680952380952381, + "grad_norm": 0.8185865879058838, + "learning_rate": 1e-05, + "loss": 1.199, + "mean_token_accuracy": 0.6634737253189087, + "num_tokens": 84927599.0, + "step": 143 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.6603442430496216, + "learning_rate": 1e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.6643534898757935, + "num_tokens": 85516121.0, + "step": 144 + }, + { + "epoch": 0.6904761904761905, + "grad_norm": 0.7519460320472717, + "learning_rate": 1e-05, + "loss": 1.1926, + "mean_token_accuracy": 0.664984941482544, + "num_tokens": 86106161.0, + "step": 145 + }, + { + "epoch": 0.6952380952380952, + "grad_norm": 0.7080089449882507, + "learning_rate": 1e-05, + "loss": 1.2086, + "mean_token_accuracy": 0.6621935963630676, + "num_tokens": 86723880.0, + "step": 146 + }, + { + "epoch": 0.7, + "grad_norm": 0.7303557395935059, + "learning_rate": 1e-05, + "loss": 1.2033, + "mean_token_accuracy": 0.6634014248847961, + "num_tokens": 87327542.0, + "step": 147 + }, + { + "epoch": 0.7047619047619048, + "grad_norm": 0.6376964449882507, + "learning_rate": 1e-05, + "loss": 1.1977, + "mean_token_accuracy": 0.6633247137069702, + "num_tokens": 87912557.0, + "step": 148 + }, + { + "epoch": 0.7095238095238096, + "grad_norm": 0.6810888051986694, + "learning_rate": 1e-05, + "loss": 1.2087, + "mean_token_accuracy": 0.6617689728736877, + "num_tokens": 88514499.0, + "step": 149 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.6272366046905518, + "learning_rate": 1e-05, + "loss": 1.1879, + "mean_token_accuracy": 0.6662660837173462, + "num_tokens": 89090412.0, + "step": 150 + }, + { + "epoch": 0.719047619047619, + "grad_norm": 0.6499550938606262, + "learning_rate": 1e-05, + "loss": 1.1978, + "mean_token_accuracy": 0.6638685464859009, + "num_tokens": 89689944.0, + "step": 151 + }, + { + "epoch": 0.7238095238095238, + "grad_norm": 0.6450507640838623, + "learning_rate": 1e-05, + "loss": 1.2088, + "mean_token_accuracy": 0.6614329218864441, + "num_tokens": 90281605.0, + "step": 152 + }, + { + "epoch": 0.7285714285714285, + "grad_norm": 0.6113287806510925, + "learning_rate": 1e-05, + "loss": 1.2095, + "mean_token_accuracy": 0.6616454124450684, + "num_tokens": 90877169.0, + "step": 153 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.6421619653701782, + "learning_rate": 1e-05, + "loss": 1.2141, + "mean_token_accuracy": 0.6598343253135681, + "num_tokens": 91473587.0, + "step": 154 + }, + { + "epoch": 0.7380952380952381, + "grad_norm": 0.5994828939437866, + "learning_rate": 1e-05, + "loss": 1.2069, + "mean_token_accuracy": 0.6615887880325317, + "num_tokens": 92066142.0, + "step": 155 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.5635871887207031, + "learning_rate": 1e-05, + "loss": 1.1885, + "mean_token_accuracy": 0.6657248735427856, + "num_tokens": 92671294.0, + "step": 156 + }, + { + "epoch": 0.7476190476190476, + "grad_norm": 0.5961142778396606, + "learning_rate": 1e-05, + "loss": 1.1915, + "mean_token_accuracy": 0.6649054884910583, + "num_tokens": 93267004.0, + "step": 157 + }, + { + "epoch": 0.7523809523809524, + "grad_norm": 0.5518187284469604, + "learning_rate": 1e-05, + "loss": 1.2093, + "mean_token_accuracy": 0.6612235307693481, + "num_tokens": 93865099.0, + "step": 158 + }, + { + "epoch": 0.7571428571428571, + "grad_norm": 0.6183374524116516, + "learning_rate": 1e-05, + "loss": 1.1825, + "mean_token_accuracy": 0.6676396131515503, + "num_tokens": 94449283.0, + "step": 159 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.5925056338310242, + "learning_rate": 1e-05, + "loss": 1.1927, + "mean_token_accuracy": 0.6643291711807251, + "num_tokens": 95037160.0, + "step": 160 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 0.6148018836975098, + "learning_rate": 1e-05, + "loss": 1.1761, + "mean_token_accuracy": 0.6689929962158203, + "num_tokens": 95620329.0, + "step": 161 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 0.6416387557983398, + "learning_rate": 1e-05, + "loss": 1.1978, + "mean_token_accuracy": 0.6625751256942749, + "num_tokens": 96202979.0, + "step": 162 + }, + { + "epoch": 0.7761904761904762, + "grad_norm": 0.5393695831298828, + "learning_rate": 1e-05, + "loss": 1.1918, + "mean_token_accuracy": 0.665260910987854, + "num_tokens": 96794135.0, + "step": 163 + }, + { + "epoch": 0.780952380952381, + "grad_norm": 0.6334103941917419, + "learning_rate": 1e-05, + "loss": 1.1821, + "mean_token_accuracy": 0.6664952635765076, + "num_tokens": 97380180.0, + "step": 164 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.6443802118301392, + "learning_rate": 1e-05, + "loss": 1.2005, + "mean_token_accuracy": 0.663545548915863, + "num_tokens": 97979583.0, + "step": 165 + }, + { + "epoch": 0.7904761904761904, + "grad_norm": 0.6070786714553833, + "learning_rate": 1e-05, + "loss": 1.1818, + "mean_token_accuracy": 0.6681106686592102, + "num_tokens": 98573453.0, + "step": 166 + }, + { + "epoch": 0.7952380952380952, + "grad_norm": 0.5983892679214478, + "learning_rate": 1e-05, + "loss": 1.189, + "mean_token_accuracy": 0.6651272177696228, + "num_tokens": 99162518.0, + "step": 167 + }, + { + "epoch": 0.8, + "grad_norm": 0.5511825084686279, + "learning_rate": 1e-05, + "loss": 1.1859, + "mean_token_accuracy": 0.6656243801116943, + "num_tokens": 99755688.0, + "step": 168 + }, + { + "epoch": 0.8047619047619048, + "grad_norm": 0.5612326264381409, + "learning_rate": 1e-05, + "loss": 1.1923, + "mean_token_accuracy": 0.6645892858505249, + "num_tokens": 100367122.0, + "step": 169 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.6149346232414246, + "learning_rate": 1e-05, + "loss": 1.1866, + "mean_token_accuracy": 0.665177583694458, + "num_tokens": 100966663.0, + "step": 170 + }, + { + "epoch": 0.8142857142857143, + "grad_norm": 0.5557584166526794, + "learning_rate": 1e-05, + "loss": 1.1993, + "mean_token_accuracy": 0.6638921499252319, + "num_tokens": 101561561.0, + "step": 171 + }, + { + "epoch": 0.819047619047619, + "grad_norm": 0.6174666285514832, + "learning_rate": 1e-05, + "loss": 1.2058, + "mean_token_accuracy": 0.6619209051132202, + "num_tokens": 102150367.0, + "step": 172 + }, + { + "epoch": 0.8238095238095238, + "grad_norm": 0.6149846911430359, + "learning_rate": 1e-05, + "loss": 1.1956, + "mean_token_accuracy": 0.6646385788917542, + "num_tokens": 102744438.0, + "step": 173 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.6205980777740479, + "learning_rate": 1e-05, + "loss": 1.1944, + "mean_token_accuracy": 0.6641254425048828, + "num_tokens": 103336159.0, + "step": 174 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6782044172286987, + "learning_rate": 1e-05, + "loss": 1.1993, + "mean_token_accuracy": 0.6630405187606812, + "num_tokens": 103933457.0, + "step": 175 + }, + { + "epoch": 0.8380952380952381, + "grad_norm": 0.6339226961135864, + "learning_rate": 1e-05, + "loss": 1.1854, + "mean_token_accuracy": 0.6652607917785645, + "num_tokens": 104528020.0, + "step": 176 + }, + { + "epoch": 0.8428571428571429, + "grad_norm": 0.604350209236145, + "learning_rate": 1e-05, + "loss": 1.2142, + "mean_token_accuracy": 0.6597182750701904, + "num_tokens": 105126562.0, + "step": 177 + }, + { + "epoch": 0.8476190476190476, + "grad_norm": 0.5730092525482178, + "learning_rate": 1e-05, + "loss": 1.1796, + "mean_token_accuracy": 0.6674203872680664, + "num_tokens": 105730229.0, + "step": 178 + }, + { + "epoch": 0.8523809523809524, + "grad_norm": 0.6724650263786316, + "learning_rate": 1e-05, + "loss": 1.201, + "mean_token_accuracy": 0.6622498035430908, + "num_tokens": 106338239.0, + "step": 179 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.5882953405380249, + "learning_rate": 1e-05, + "loss": 1.1982, + "mean_token_accuracy": 0.6630674600601196, + "num_tokens": 106929782.0, + "step": 180 + }, + { + "epoch": 0.861904761904762, + "grad_norm": 0.6305244565010071, + "learning_rate": 1e-05, + "loss": 1.1932, + "mean_token_accuracy": 0.6646133661270142, + "num_tokens": 107516950.0, + "step": 181 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.6297836899757385, + "learning_rate": 1e-05, + "loss": 1.1825, + "mean_token_accuracy": 0.6660134792327881, + "num_tokens": 108104046.0, + "step": 182 + }, + { + "epoch": 0.8714285714285714, + "grad_norm": 0.5446469783782959, + "learning_rate": 1e-05, + "loss": 1.1992, + "mean_token_accuracy": 0.6630533933639526, + "num_tokens": 108711068.0, + "step": 183 + }, + { + "epoch": 0.8761904761904762, + "grad_norm": 0.5844411253929138, + "learning_rate": 1e-05, + "loss": 1.1687, + "mean_token_accuracy": 0.669592022895813, + "num_tokens": 109294847.0, + "step": 184 + }, + { + "epoch": 0.8809523809523809, + "grad_norm": 0.6065420508384705, + "learning_rate": 1e-05, + "loss": 1.1886, + "mean_token_accuracy": 0.664987325668335, + "num_tokens": 109903424.0, + "step": 185 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.6002596616744995, + "learning_rate": 1e-05, + "loss": 1.1894, + "mean_token_accuracy": 0.66484135389328, + "num_tokens": 110515082.0, + "step": 186 + }, + { + "epoch": 0.8904761904761904, + "grad_norm": 0.5755858421325684, + "learning_rate": 1e-05, + "loss": 1.1887, + "mean_token_accuracy": 0.6651521325111389, + "num_tokens": 111105456.0, + "step": 187 + }, + { + "epoch": 0.8952380952380953, + "grad_norm": 0.6171888709068298, + "learning_rate": 1e-05, + "loss": 1.1893, + "mean_token_accuracy": 0.6657494306564331, + "num_tokens": 111699029.0, + "step": 188 + }, + { + "epoch": 0.9, + "grad_norm": 0.579205334186554, + "learning_rate": 1e-05, + "loss": 1.1659, + "mean_token_accuracy": 0.6696426272392273, + "num_tokens": 112280321.0, + "step": 189 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.6712483167648315, + "learning_rate": 1e-05, + "loss": 1.1677, + "mean_token_accuracy": 0.6694087982177734, + "num_tokens": 112860009.0, + "step": 190 + }, + { + "epoch": 0.9095238095238095, + "grad_norm": 0.6215792894363403, + "learning_rate": 1e-05, + "loss": 1.1872, + "mean_token_accuracy": 0.6649343967437744, + "num_tokens": 113457303.0, + "step": 191 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.5627334117889404, + "learning_rate": 1e-05, + "loss": 1.181, + "mean_token_accuracy": 0.6672377586364746, + "num_tokens": 114054977.0, + "step": 192 + }, + { + "epoch": 0.919047619047619, + "grad_norm": 0.5678215622901917, + "learning_rate": 1e-05, + "loss": 1.1778, + "mean_token_accuracy": 0.6673398613929749, + "num_tokens": 114641555.0, + "step": 193 + }, + { + "epoch": 0.9238095238095239, + "grad_norm": 0.5933332443237305, + "learning_rate": 1e-05, + "loss": 1.1939, + "mean_token_accuracy": 0.6647536754608154, + "num_tokens": 115241437.0, + "step": 194 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.5732199549674988, + "learning_rate": 1e-05, + "loss": 1.1714, + "mean_token_accuracy": 0.6686159372329712, + "num_tokens": 115845775.0, + "step": 195 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.6514256596565247, + "learning_rate": 1e-05, + "loss": 1.1782, + "mean_token_accuracy": 0.6679466962814331, + "num_tokens": 116452623.0, + "step": 196 + }, + { + "epoch": 0.9380952380952381, + "grad_norm": 0.5765755772590637, + "learning_rate": 1e-05, + "loss": 1.1861, + "mean_token_accuracy": 0.6654437780380249, + "num_tokens": 117045570.0, + "step": 197 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.7004836797714233, + "learning_rate": 1e-05, + "loss": 1.1638, + "mean_token_accuracy": 0.6707776784896851, + "num_tokens": 117654535.0, + "step": 198 + }, + { + "epoch": 0.9476190476190476, + "grad_norm": 0.5966997146606445, + "learning_rate": 1e-05, + "loss": 1.1772, + "mean_token_accuracy": 0.6684892177581787, + "num_tokens": 118247244.0, + "step": 199 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.6460300087928772, + "learning_rate": 1e-05, + "loss": 1.1713, + "mean_token_accuracy": 0.6694802045822144, + "num_tokens": 118843074.0, + "step": 200 + }, + { + "epoch": 0.9571428571428572, + "grad_norm": 0.599161684513092, + "learning_rate": 1e-05, + "loss": 1.1712, + "mean_token_accuracy": 0.6690815687179565, + "num_tokens": 119445023.0, + "step": 201 + }, + { + "epoch": 0.9619047619047619, + "grad_norm": 0.6229502558708191, + "learning_rate": 1e-05, + "loss": 1.1864, + "mean_token_accuracy": 0.6660387516021729, + "num_tokens": 120045748.0, + "step": 202 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 0.6429843306541443, + "learning_rate": 1e-05, + "loss": 1.1785, + "mean_token_accuracy": 0.6669691205024719, + "num_tokens": 120635079.0, + "step": 203 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 0.6153910756111145, + "learning_rate": 1e-05, + "loss": 1.1791, + "mean_token_accuracy": 0.66630619764328, + "num_tokens": 121220486.0, + "step": 204 + }, + { + "epoch": 0.9761904761904762, + "grad_norm": 0.6496953368186951, + "learning_rate": 1e-05, + "loss": 1.1804, + "mean_token_accuracy": 0.6666555404663086, + "num_tokens": 121800676.0, + "step": 205 + }, + { + "epoch": 0.9809523809523809, + "grad_norm": 0.6011868119239807, + "learning_rate": 1e-05, + "loss": 1.1842, + "mean_token_accuracy": 0.6658217906951904, + "num_tokens": 122409399.0, + "step": 206 + }, + { + "epoch": 0.9857142857142858, + "grad_norm": 0.857315182685852, + "learning_rate": 1e-05, + "loss": 1.1652, + "mean_token_accuracy": 0.6701173186302185, + "num_tokens": 123003502.0, + "step": 207 + }, + { + "epoch": 0.9904761904761905, + "grad_norm": 0.6711968183517456, + "learning_rate": 1e-05, + "loss": 1.1821, + "mean_token_accuracy": 0.6669960021972656, + "num_tokens": 123595838.0, + "step": 208 + }, + { + "epoch": 0.9952380952380953, + "grad_norm": 0.8044399619102478, + "learning_rate": 1e-05, + "loss": 1.1797, + "mean_token_accuracy": 0.6671728491783142, + "num_tokens": 124166476.0, + "step": 209 + }, + { + "epoch": 1.0, + "grad_norm": 0.724872887134552, + "learning_rate": 1e-05, + "loss": 1.1689, + "mean_token_accuracy": 0.66896653175354, + "num_tokens": 124761423.0, + "step": 210 + }, + { + "epoch": 1.0047619047619047, + "grad_norm": 0.7732614278793335, + "learning_rate": 1e-05, + "loss": 1.176, + "mean_token_accuracy": 0.6668572425842285, + "num_tokens": 125364371.0, + "step": 211 + }, + { + "epoch": 1.0095238095238095, + "grad_norm": 0.6983124017715454, + "learning_rate": 1e-05, + "loss": 1.1342, + "mean_token_accuracy": 0.6760746240615845, + "num_tokens": 125954118.0, + "step": 212 + }, + { + "epoch": 1.0142857142857142, + "grad_norm": 0.6097580790519714, + "learning_rate": 1e-05, + "loss": 1.1398, + "mean_token_accuracy": 0.6745401620864868, + "num_tokens": 126544991.0, + "step": 213 + }, + { + "epoch": 1.019047619047619, + "grad_norm": 0.6844852566719055, + "learning_rate": 1e-05, + "loss": 1.1425, + "mean_token_accuracy": 0.6751389503479004, + "num_tokens": 127151772.0, + "step": 214 + }, + { + "epoch": 1.0238095238095237, + "grad_norm": 0.7108845114707947, + "learning_rate": 1e-05, + "loss": 1.1472, + "mean_token_accuracy": 0.6734536290168762, + "num_tokens": 127762517.0, + "step": 215 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.7051171660423279, + "learning_rate": 1e-05, + "loss": 1.1516, + "mean_token_accuracy": 0.672438383102417, + "num_tokens": 128358892.0, + "step": 216 + }, + { + "epoch": 1.0333333333333334, + "grad_norm": 0.742440938949585, + "learning_rate": 1e-05, + "loss": 1.1486, + "mean_token_accuracy": 0.6727321743965149, + "num_tokens": 128930309.0, + "step": 217 + }, + { + "epoch": 1.0380952380952382, + "grad_norm": 0.6921288371086121, + "learning_rate": 1e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.6769453883171082, + "num_tokens": 129537678.0, + "step": 218 + }, + { + "epoch": 1.042857142857143, + "grad_norm": 0.6531715989112854, + "learning_rate": 1e-05, + "loss": 1.1486, + "mean_token_accuracy": 0.6732891201972961, + "num_tokens": 130113717.0, + "step": 219 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 0.8497748970985413, + "learning_rate": 1e-05, + "loss": 1.1554, + "mean_token_accuracy": 0.6714987754821777, + "num_tokens": 130724521.0, + "step": 220 + }, + { + "epoch": 1.0523809523809524, + "grad_norm": 0.6819850206375122, + "learning_rate": 1e-05, + "loss": 1.1407, + "mean_token_accuracy": 0.6752928495407104, + "num_tokens": 131298037.0, + "step": 221 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 0.785930335521698, + "learning_rate": 1e-05, + "loss": 1.1486, + "mean_token_accuracy": 0.6729685068130493, + "num_tokens": 131909779.0, + "step": 222 + }, + { + "epoch": 1.061904761904762, + "grad_norm": 0.6023511290550232, + "learning_rate": 1e-05, + "loss": 1.1458, + "mean_token_accuracy": 0.6734186410903931, + "num_tokens": 132506621.0, + "step": 223 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.8720818758010864, + "learning_rate": 1e-05, + "loss": 1.1498, + "mean_token_accuracy": 0.6726520657539368, + "num_tokens": 133124443.0, + "step": 224 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.6429004073143005, + "learning_rate": 1e-05, + "loss": 1.1608, + "mean_token_accuracy": 0.6698133945465088, + "num_tokens": 133719672.0, + "step": 225 + }, + { + "epoch": 1.0761904761904761, + "grad_norm": 0.7744424343109131, + "learning_rate": 1e-05, + "loss": 1.1357, + "mean_token_accuracy": 0.6747680306434631, + "num_tokens": 134309852.0, + "step": 226 + }, + { + "epoch": 1.0809523809523809, + "grad_norm": 0.7106124758720398, + "learning_rate": 1e-05, + "loss": 1.1472, + "mean_token_accuracy": 0.6723679900169373, + "num_tokens": 134890952.0, + "step": 227 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 0.8420917987823486, + "learning_rate": 1e-05, + "loss": 1.1144, + "mean_token_accuracy": 0.6813350915908813, + "num_tokens": 135479588.0, + "step": 228 + }, + { + "epoch": 1.0904761904761904, + "grad_norm": 0.7307847738265991, + "learning_rate": 1e-05, + "loss": 1.14, + "mean_token_accuracy": 0.6748834848403931, + "num_tokens": 136065836.0, + "step": 229 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.6740959882736206, + "learning_rate": 1e-05, + "loss": 1.1377, + "mean_token_accuracy": 0.6761696934700012, + "num_tokens": 136668062.0, + "step": 230 + }, + { + "epoch": 1.1, + "grad_norm": 0.6920994520187378, + "learning_rate": 1e-05, + "loss": 1.1398, + "mean_token_accuracy": 0.6743276715278625, + "num_tokens": 137256533.0, + "step": 231 + }, + { + "epoch": 1.1047619047619048, + "grad_norm": 0.6870349645614624, + "learning_rate": 1e-05, + "loss": 1.1459, + "mean_token_accuracy": 0.6732701063156128, + "num_tokens": 137848246.0, + "step": 232 + }, + { + "epoch": 1.1095238095238096, + "grad_norm": 0.6535449028015137, + "learning_rate": 1e-05, + "loss": 1.1494, + "mean_token_accuracy": 0.6729423999786377, + "num_tokens": 138450870.0, + "step": 233 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.6108024716377258, + "learning_rate": 1e-05, + "loss": 1.1498, + "mean_token_accuracy": 0.671773374080658, + "num_tokens": 139048178.0, + "step": 234 + }, + { + "epoch": 1.119047619047619, + "grad_norm": 0.618743360042572, + "learning_rate": 1e-05, + "loss": 1.1394, + "mean_token_accuracy": 0.6749163866043091, + "num_tokens": 139647536.0, + "step": 235 + }, + { + "epoch": 1.1238095238095238, + "grad_norm": 0.5873496532440186, + "learning_rate": 1e-05, + "loss": 1.1428, + "mean_token_accuracy": 0.6742033958435059, + "num_tokens": 140237569.0, + "step": 236 + }, + { + "epoch": 1.1285714285714286, + "grad_norm": 0.6749809980392456, + "learning_rate": 1e-05, + "loss": 1.1462, + "mean_token_accuracy": 0.67291659116745, + "num_tokens": 140808948.0, + "step": 237 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.5988799333572388, + "learning_rate": 1e-05, + "loss": 1.1417, + "mean_token_accuracy": 0.6734879016876221, + "num_tokens": 141387906.0, + "step": 238 + }, + { + "epoch": 1.138095238095238, + "grad_norm": 0.7041788697242737, + "learning_rate": 1e-05, + "loss": 1.1421, + "mean_token_accuracy": 0.6749635934829712, + "num_tokens": 141991024.0, + "step": 239 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.677106499671936, + "learning_rate": 1e-05, + "loss": 1.122, + "mean_token_accuracy": 0.6789741516113281, + "num_tokens": 142585170.0, + "step": 240 + }, + { + "epoch": 1.1476190476190475, + "grad_norm": 0.6422439217567444, + "learning_rate": 1e-05, + "loss": 1.1509, + "mean_token_accuracy": 0.6719658374786377, + "num_tokens": 143178473.0, + "step": 241 + }, + { + "epoch": 1.1523809523809523, + "grad_norm": 0.6920860409736633, + "learning_rate": 1e-05, + "loss": 1.1511, + "mean_token_accuracy": 0.6708908677101135, + "num_tokens": 143782184.0, + "step": 242 + }, + { + "epoch": 1.157142857142857, + "grad_norm": 0.5582302212715149, + "learning_rate": 1e-05, + "loss": 1.1331, + "mean_token_accuracy": 0.6759682297706604, + "num_tokens": 144383051.0, + "step": 243 + }, + { + "epoch": 1.161904761904762, + "grad_norm": 0.6627556085586548, + "learning_rate": 1e-05, + "loss": 1.1432, + "mean_token_accuracy": 0.6744831204414368, + "num_tokens": 144977872.0, + "step": 244 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.5956741571426392, + "learning_rate": 1e-05, + "loss": 1.1456, + "mean_token_accuracy": 0.6733117699623108, + "num_tokens": 145573077.0, + "step": 245 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 0.7862910628318787, + "learning_rate": 1e-05, + "loss": 1.1464, + "mean_token_accuracy": 0.6739993691444397, + "num_tokens": 146165107.0, + "step": 246 + }, + { + "epoch": 1.1761904761904762, + "grad_norm": 0.6099702715873718, + "learning_rate": 1e-05, + "loss": 1.1393, + "mean_token_accuracy": 0.6740779876708984, + "num_tokens": 146763356.0, + "step": 247 + }, + { + "epoch": 1.180952380952381, + "grad_norm": 0.7584065198898315, + "learning_rate": 1e-05, + "loss": 1.136, + "mean_token_accuracy": 0.6759775280952454, + "num_tokens": 147358035.0, + "step": 248 + }, + { + "epoch": 1.1857142857142857, + "grad_norm": 0.6754823327064514, + "learning_rate": 1e-05, + "loss": 1.1523, + "mean_token_accuracy": 0.6710008978843689, + "num_tokens": 147955530.0, + "step": 249 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.6045711636543274, + "learning_rate": 1e-05, + "loss": 1.1468, + "mean_token_accuracy": 0.6728271245956421, + "num_tokens": 148547950.0, + "step": 250 + }, + { + "epoch": 1.1952380952380952, + "grad_norm": 0.6770275235176086, + "learning_rate": 1e-05, + "loss": 1.1309, + "mean_token_accuracy": 0.6762286424636841, + "num_tokens": 149127280.0, + "step": 251 + }, + { + "epoch": 1.2, + "grad_norm": 0.5667791366577148, + "learning_rate": 1e-05, + "loss": 1.1389, + "mean_token_accuracy": 0.6750789284706116, + "num_tokens": 149735096.0, + "step": 252 + }, + { + "epoch": 1.2047619047619047, + "grad_norm": 0.6122450232505798, + "learning_rate": 1e-05, + "loss": 1.1423, + "mean_token_accuracy": 0.6746940612792969, + "num_tokens": 150338864.0, + "step": 253 + }, + { + "epoch": 1.2095238095238094, + "grad_norm": 0.6596109867095947, + "learning_rate": 1e-05, + "loss": 1.1234, + "mean_token_accuracy": 0.6786649227142334, + "num_tokens": 150940365.0, + "step": 254 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.6414262652397156, + "learning_rate": 1e-05, + "loss": 1.1454, + "mean_token_accuracy": 0.6734991073608398, + "num_tokens": 151517902.0, + "step": 255 + }, + { + "epoch": 1.2190476190476192, + "grad_norm": 0.7465854287147522, + "learning_rate": 1e-05, + "loss": 1.1225, + "mean_token_accuracy": 0.6790366172790527, + "num_tokens": 152093932.0, + "step": 256 + }, + { + "epoch": 1.223809523809524, + "grad_norm": 0.6045883297920227, + "learning_rate": 1e-05, + "loss": 1.1281, + "mean_token_accuracy": 0.6779497861862183, + "num_tokens": 152690003.0, + "step": 257 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 0.7717053890228271, + "learning_rate": 1e-05, + "loss": 1.1305, + "mean_token_accuracy": 0.6769629716873169, + "num_tokens": 153278014.0, + "step": 258 + }, + { + "epoch": 1.2333333333333334, + "grad_norm": 0.6217109560966492, + "learning_rate": 1e-05, + "loss": 1.1377, + "mean_token_accuracy": 0.6756360530853271, + "num_tokens": 153871781.0, + "step": 259 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.7101379632949829, + "learning_rate": 1e-05, + "loss": 1.1396, + "mean_token_accuracy": 0.6745343208312988, + "num_tokens": 154466124.0, + "step": 260 + }, + { + "epoch": 1.2428571428571429, + "grad_norm": 0.6611591577529907, + "learning_rate": 1e-05, + "loss": 1.1342, + "mean_token_accuracy": 0.675082802772522, + "num_tokens": 155073053.0, + "step": 261 + }, + { + "epoch": 1.2476190476190476, + "grad_norm": 0.7041805386543274, + "learning_rate": 1e-05, + "loss": 1.1612, + "mean_token_accuracy": 0.6703898906707764, + "num_tokens": 155680694.0, + "step": 262 + }, + { + "epoch": 1.2523809523809524, + "grad_norm": 0.6518973708152771, + "learning_rate": 1e-05, + "loss": 1.1492, + "mean_token_accuracy": 0.6719495058059692, + "num_tokens": 156279612.0, + "step": 263 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.6293846368789673, + "learning_rate": 1e-05, + "loss": 1.1381, + "mean_token_accuracy": 0.6761749982833862, + "num_tokens": 156898086.0, + "step": 264 + }, + { + "epoch": 1.2619047619047619, + "grad_norm": 0.5713494420051575, + "learning_rate": 1e-05, + "loss": 1.1527, + "mean_token_accuracy": 0.6716663837432861, + "num_tokens": 157502996.0, + "step": 265 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.6561734676361084, + "learning_rate": 1e-05, + "loss": 1.1544, + "mean_token_accuracy": 0.6708611845970154, + "num_tokens": 158107778.0, + "step": 266 + }, + { + "epoch": 1.2714285714285714, + "grad_norm": 0.5799586772918701, + "learning_rate": 1e-05, + "loss": 1.1177, + "mean_token_accuracy": 0.6797953844070435, + "num_tokens": 158713147.0, + "step": 267 + }, + { + "epoch": 1.276190476190476, + "grad_norm": 0.5941030979156494, + "learning_rate": 1e-05, + "loss": 1.1255, + "mean_token_accuracy": 0.6776763200759888, + "num_tokens": 159292006.0, + "step": 268 + }, + { + "epoch": 1.2809523809523808, + "grad_norm": 0.6683588624000549, + "learning_rate": 1e-05, + "loss": 1.1234, + "mean_token_accuracy": 0.6778484582901001, + "num_tokens": 159889197.0, + "step": 269 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.6561569571495056, + "learning_rate": 1e-05, + "loss": 1.1378, + "mean_token_accuracy": 0.6750425696372986, + "num_tokens": 160485304.0, + "step": 270 + }, + { + "epoch": 1.2904761904761906, + "grad_norm": 0.5719537138938904, + "learning_rate": 1e-05, + "loss": 1.1404, + "mean_token_accuracy": 0.6747204065322876, + "num_tokens": 161092433.0, + "step": 271 + }, + { + "epoch": 1.2952380952380953, + "grad_norm": 0.6006868481636047, + "learning_rate": 1e-05, + "loss": 1.1396, + "mean_token_accuracy": 0.6749382019042969, + "num_tokens": 161683555.0, + "step": 272 + }, + { + "epoch": 1.3, + "grad_norm": 0.6102608442306519, + "learning_rate": 1e-05, + "loss": 1.1293, + "mean_token_accuracy": 0.6775893568992615, + "num_tokens": 162278973.0, + "step": 273 + }, + { + "epoch": 1.3047619047619048, + "grad_norm": 0.6217197179794312, + "learning_rate": 1e-05, + "loss": 1.1366, + "mean_token_accuracy": 0.6764044165611267, + "num_tokens": 162885270.0, + "step": 274 + }, + { + "epoch": 1.3095238095238095, + "grad_norm": 0.6187546253204346, + "learning_rate": 1e-05, + "loss": 1.1315, + "mean_token_accuracy": 0.6765252351760864, + "num_tokens": 163476311.0, + "step": 275 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.5942601561546326, + "learning_rate": 1e-05, + "loss": 1.1455, + "mean_token_accuracy": 0.6730477213859558, + "num_tokens": 164071314.0, + "step": 276 + }, + { + "epoch": 1.319047619047619, + "grad_norm": 0.5942831635475159, + "learning_rate": 1e-05, + "loss": 1.1321, + "mean_token_accuracy": 0.6764451861381531, + "num_tokens": 164663415.0, + "step": 277 + }, + { + "epoch": 1.3238095238095238, + "grad_norm": 0.6232311129570007, + "learning_rate": 1e-05, + "loss": 1.1269, + "mean_token_accuracy": 0.6775949597358704, + "num_tokens": 165256997.0, + "step": 278 + }, + { + "epoch": 1.3285714285714285, + "grad_norm": 0.6126914024353027, + "learning_rate": 1e-05, + "loss": 1.1317, + "mean_token_accuracy": 0.676669716835022, + "num_tokens": 165847922.0, + "step": 279 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.6624312400817871, + "learning_rate": 1e-05, + "loss": 1.1337, + "mean_token_accuracy": 0.6758729815483093, + "num_tokens": 166444541.0, + "step": 280 + }, + { + "epoch": 1.3380952380952382, + "grad_norm": 0.6634590029716492, + "learning_rate": 1e-05, + "loss": 1.1246, + "mean_token_accuracy": 0.6781991124153137, + "num_tokens": 167028591.0, + "step": 281 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.7142046093940735, + "learning_rate": 1e-05, + "loss": 1.1473, + "mean_token_accuracy": 0.6724534034729004, + "num_tokens": 167627132.0, + "step": 282 + }, + { + "epoch": 1.3476190476190477, + "grad_norm": 0.5835825800895691, + "learning_rate": 1e-05, + "loss": 1.119, + "mean_token_accuracy": 0.6801720857620239, + "num_tokens": 168226854.0, + "step": 283 + }, + { + "epoch": 1.3523809523809525, + "grad_norm": 0.7441895008087158, + "learning_rate": 1e-05, + "loss": 1.1508, + "mean_token_accuracy": 0.6721788048744202, + "num_tokens": 168833732.0, + "step": 284 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.613866925239563, + "learning_rate": 1e-05, + "loss": 1.1263, + "mean_token_accuracy": 0.6785060167312622, + "num_tokens": 169426970.0, + "step": 285 + }, + { + "epoch": 1.361904761904762, + "grad_norm": 0.7395045161247253, + "learning_rate": 1e-05, + "loss": 1.1504, + "mean_token_accuracy": 0.6733224391937256, + "num_tokens": 170025787.0, + "step": 286 + }, + { + "epoch": 1.3666666666666667, + "grad_norm": 0.7011858224868774, + "learning_rate": 1e-05, + "loss": 1.1457, + "mean_token_accuracy": 0.6723621487617493, + "num_tokens": 170621857.0, + "step": 287 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.6301146149635315, + "learning_rate": 1e-05, + "loss": 1.1428, + "mean_token_accuracy": 0.6736270189285278, + "num_tokens": 171220708.0, + "step": 288 + }, + { + "epoch": 1.3761904761904762, + "grad_norm": 0.6546505093574524, + "learning_rate": 1e-05, + "loss": 1.1527, + "mean_token_accuracy": 0.6708388924598694, + "num_tokens": 171812508.0, + "step": 289 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.665846049785614, + "learning_rate": 1e-05, + "loss": 1.1239, + "mean_token_accuracy": 0.6771635413169861, + "num_tokens": 172401090.0, + "step": 290 + }, + { + "epoch": 1.3857142857142857, + "grad_norm": 0.6951489448547363, + "learning_rate": 1e-05, + "loss": 1.1303, + "mean_token_accuracy": 0.6767557263374329, + "num_tokens": 172989201.0, + "step": 291 + }, + { + "epoch": 1.3904761904761904, + "grad_norm": 0.6228903532028198, + "learning_rate": 1e-05, + "loss": 1.1316, + "mean_token_accuracy": 0.6754661798477173, + "num_tokens": 173563807.0, + "step": 292 + }, + { + "epoch": 1.3952380952380952, + "grad_norm": 0.7011890411376953, + "learning_rate": 1e-05, + "loss": 1.1303, + "mean_token_accuracy": 0.6768910884857178, + "num_tokens": 174159574.0, + "step": 293 + }, + { + "epoch": 1.4, + "grad_norm": 0.6298404932022095, + "learning_rate": 1e-05, + "loss": 1.1487, + "mean_token_accuracy": 0.672224223613739, + "num_tokens": 174744244.0, + "step": 294 + }, + { + "epoch": 1.4047619047619047, + "grad_norm": 0.6158511638641357, + "learning_rate": 1e-05, + "loss": 1.1315, + "mean_token_accuracy": 0.6756511926651001, + "num_tokens": 175341946.0, + "step": 295 + }, + { + "epoch": 1.4095238095238094, + "grad_norm": 0.6887179613113403, + "learning_rate": 1e-05, + "loss": 1.1019, + "mean_token_accuracy": 0.6828951239585876, + "num_tokens": 175904117.0, + "step": 296 + }, + { + "epoch": 1.4142857142857144, + "grad_norm": 0.64696204662323, + "learning_rate": 1e-05, + "loss": 1.1307, + "mean_token_accuracy": 0.6764581799507141, + "num_tokens": 176493621.0, + "step": 297 + }, + { + "epoch": 1.4190476190476191, + "grad_norm": 0.5804628133773804, + "learning_rate": 1e-05, + "loss": 1.1316, + "mean_token_accuracy": 0.6758260726928711, + "num_tokens": 177082157.0, + "step": 298 + }, + { + "epoch": 1.4238095238095239, + "grad_norm": 0.6294459104537964, + "learning_rate": 1e-05, + "loss": 1.1325, + "mean_token_accuracy": 0.6751164197921753, + "num_tokens": 177668681.0, + "step": 299 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.617782711982727, + "learning_rate": 1e-05, + "loss": 1.1352, + "mean_token_accuracy": 0.6748452186584473, + "num_tokens": 178256283.0, + "step": 300 + }, + { + "epoch": 1.4333333333333333, + "grad_norm": 0.6512781977653503, + "learning_rate": 1e-05, + "loss": 1.1468, + "mean_token_accuracy": 0.6721617579460144, + "num_tokens": 178850673.0, + "step": 301 + }, + { + "epoch": 1.438095238095238, + "grad_norm": 0.5774661898612976, + "learning_rate": 1e-05, + "loss": 1.1246, + "mean_token_accuracy": 0.6787533760070801, + "num_tokens": 179457871.0, + "step": 302 + }, + { + "epoch": 1.4428571428571428, + "grad_norm": 0.5992771983146667, + "learning_rate": 1e-05, + "loss": 1.1548, + "mean_token_accuracy": 0.6706414818763733, + "num_tokens": 180064071.0, + "step": 303 + }, + { + "epoch": 1.4476190476190476, + "grad_norm": 0.5943005681037903, + "learning_rate": 1e-05, + "loss": 1.1106, + "mean_token_accuracy": 0.6806790828704834, + "num_tokens": 180650796.0, + "step": 304 + }, + { + "epoch": 1.4523809523809523, + "grad_norm": 0.6455477476119995, + "learning_rate": 1e-05, + "loss": 1.1409, + "mean_token_accuracy": 0.6753484606742859, + "num_tokens": 181246825.0, + "step": 305 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 0.5515779852867126, + "learning_rate": 1e-05, + "loss": 1.1429, + "mean_token_accuracy": 0.6737354397773743, + "num_tokens": 181855567.0, + "step": 306 + }, + { + "epoch": 1.461904761904762, + "grad_norm": 0.6088519096374512, + "learning_rate": 1e-05, + "loss": 1.1095, + "mean_token_accuracy": 0.680343508720398, + "num_tokens": 182433911.0, + "step": 307 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.6310312747955322, + "learning_rate": 1e-05, + "loss": 1.1307, + "mean_token_accuracy": 0.676478385925293, + "num_tokens": 183023144.0, + "step": 308 + }, + { + "epoch": 1.4714285714285715, + "grad_norm": 0.6333861947059631, + "learning_rate": 1e-05, + "loss": 1.1225, + "mean_token_accuracy": 0.6778949499130249, + "num_tokens": 183626514.0, + "step": 309 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 0.6410499811172485, + "learning_rate": 1e-05, + "loss": 1.1284, + "mean_token_accuracy": 0.6767443418502808, + "num_tokens": 184221439.0, + "step": 310 + }, + { + "epoch": 1.480952380952381, + "grad_norm": 0.6700615882873535, + "learning_rate": 1e-05, + "loss": 1.134, + "mean_token_accuracy": 0.6758592128753662, + "num_tokens": 184819506.0, + "step": 311 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.5785894989967346, + "learning_rate": 1e-05, + "loss": 1.1338, + "mean_token_accuracy": 0.6757279634475708, + "num_tokens": 185419019.0, + "step": 312 + }, + { + "epoch": 1.4904761904761905, + "grad_norm": 0.6253511309623718, + "learning_rate": 1e-05, + "loss": 1.1212, + "mean_token_accuracy": 0.6801990270614624, + "num_tokens": 186010772.0, + "step": 313 + }, + { + "epoch": 1.4952380952380953, + "grad_norm": 0.6034374237060547, + "learning_rate": 1e-05, + "loss": 1.1178, + "mean_token_accuracy": 0.6792829036712646, + "num_tokens": 186589243.0, + "step": 314 + }, + { + "epoch": 1.5, + "grad_norm": 0.6875804662704468, + "learning_rate": 1e-05, + "loss": 1.1165, + "mean_token_accuracy": 0.6799081563949585, + "num_tokens": 187182368.0, + "step": 315 + }, + { + "epoch": 1.5047619047619047, + "grad_norm": 0.5927019119262695, + "learning_rate": 1e-05, + "loss": 1.1179, + "mean_token_accuracy": 0.6792271733283997, + "num_tokens": 187763428.0, + "step": 316 + }, + { + "epoch": 1.5095238095238095, + "grad_norm": 0.5725839734077454, + "learning_rate": 1e-05, + "loss": 1.1129, + "mean_token_accuracy": 0.6808658838272095, + "num_tokens": 188359395.0, + "step": 317 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.6134579181671143, + "learning_rate": 1e-05, + "loss": 1.1329, + "mean_token_accuracy": 0.6752611398696899, + "num_tokens": 188952450.0, + "step": 318 + }, + { + "epoch": 1.519047619047619, + "grad_norm": 0.5980193018913269, + "learning_rate": 1e-05, + "loss": 1.1282, + "mean_token_accuracy": 0.6765316128730774, + "num_tokens": 189535853.0, + "step": 319 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.6418870091438293, + "learning_rate": 1e-05, + "loss": 1.1113, + "mean_token_accuracy": 0.6808905601501465, + "num_tokens": 190127386.0, + "step": 320 + }, + { + "epoch": 1.5285714285714285, + "grad_norm": 0.5932308435440063, + "learning_rate": 1e-05, + "loss": 1.1282, + "mean_token_accuracy": 0.6762252449989319, + "num_tokens": 190718877.0, + "step": 321 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.6508740782737732, + "learning_rate": 1e-05, + "loss": 1.1504, + "mean_token_accuracy": 0.6717185974121094, + "num_tokens": 191320553.0, + "step": 322 + }, + { + "epoch": 1.538095238095238, + "grad_norm": 0.6029355525970459, + "learning_rate": 1e-05, + "loss": 1.1219, + "mean_token_accuracy": 0.6790941953659058, + "num_tokens": 191911786.0, + "step": 323 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.5820804834365845, + "learning_rate": 1e-05, + "loss": 1.1483, + "mean_token_accuracy": 0.6729787588119507, + "num_tokens": 192517254.0, + "step": 324 + }, + { + "epoch": 1.5476190476190477, + "grad_norm": 0.6086446642875671, + "learning_rate": 1e-05, + "loss": 1.1438, + "mean_token_accuracy": 0.6730492115020752, + "num_tokens": 193113713.0, + "step": 325 + }, + { + "epoch": 1.5523809523809524, + "grad_norm": 0.6287596821784973, + "learning_rate": 1e-05, + "loss": 1.1255, + "mean_token_accuracy": 0.6779239177703857, + "num_tokens": 193718335.0, + "step": 326 + }, + { + "epoch": 1.5571428571428572, + "grad_norm": 0.6495358347892761, + "learning_rate": 1e-05, + "loss": 1.1267, + "mean_token_accuracy": 0.6764586567878723, + "num_tokens": 194303328.0, + "step": 327 + }, + { + "epoch": 1.561904761904762, + "grad_norm": 0.6034678816795349, + "learning_rate": 1e-05, + "loss": 1.1204, + "mean_token_accuracy": 0.6789346933364868, + "num_tokens": 194886509.0, + "step": 328 + }, + { + "epoch": 1.5666666666666667, + "grad_norm": 0.6537843346595764, + "learning_rate": 1e-05, + "loss": 1.1269, + "mean_token_accuracy": 0.678215742111206, + "num_tokens": 195456896.0, + "step": 329 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.5981965661048889, + "learning_rate": 1e-05, + "loss": 1.1237, + "mean_token_accuracy": 0.6771047115325928, + "num_tokens": 196053871.0, + "step": 330 + }, + { + "epoch": 1.5761904761904761, + "grad_norm": 0.7181389331817627, + "learning_rate": 1e-05, + "loss": 1.1236, + "mean_token_accuracy": 0.6774399280548096, + "num_tokens": 196654732.0, + "step": 331 + }, + { + "epoch": 1.580952380952381, + "grad_norm": 0.6066569089889526, + "learning_rate": 1e-05, + "loss": 1.1124, + "mean_token_accuracy": 0.6811067461967468, + "num_tokens": 197242864.0, + "step": 332 + }, + { + "epoch": 1.5857142857142859, + "grad_norm": 0.7779151797294617, + "learning_rate": 1e-05, + "loss": 1.1153, + "mean_token_accuracy": 0.6798511743545532, + "num_tokens": 197840214.0, + "step": 333 + }, + { + "epoch": 1.5904761904761906, + "grad_norm": 0.5971040725708008, + "learning_rate": 1e-05, + "loss": 1.1177, + "mean_token_accuracy": 0.6795299649238586, + "num_tokens": 198440572.0, + "step": 334 + }, + { + "epoch": 1.5952380952380953, + "grad_norm": 0.6526306867599487, + "learning_rate": 1e-05, + "loss": 1.1134, + "mean_token_accuracy": 0.6805366277694702, + "num_tokens": 199039184.0, + "step": 335 + }, + { + "epoch": 1.6, + "grad_norm": 0.622909426689148, + "learning_rate": 1e-05, + "loss": 1.1139, + "mean_token_accuracy": 0.6792494058609009, + "num_tokens": 199626548.0, + "step": 336 + }, + { + "epoch": 1.6047619047619048, + "grad_norm": 0.6684408187866211, + "learning_rate": 1e-05, + "loss": 1.128, + "mean_token_accuracy": 0.6774076819419861, + "num_tokens": 200222258.0, + "step": 337 + }, + { + "epoch": 1.6095238095238096, + "grad_norm": 0.5934977531433105, + "learning_rate": 1e-05, + "loss": 1.1203, + "mean_token_accuracy": 0.6792654991149902, + "num_tokens": 200819172.0, + "step": 338 + }, + { + "epoch": 1.6142857142857143, + "grad_norm": 0.6164219975471497, + "learning_rate": 1e-05, + "loss": 1.1314, + "mean_token_accuracy": 0.6759560704231262, + "num_tokens": 201413549.0, + "step": 339 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 0.6061872839927673, + "learning_rate": 1e-05, + "loss": 1.1162, + "mean_token_accuracy": 0.6795899868011475, + "num_tokens": 202014069.0, + "step": 340 + }, + { + "epoch": 1.6238095238095238, + "grad_norm": 0.6192796230316162, + "learning_rate": 1e-05, + "loss": 1.1476, + "mean_token_accuracy": 0.6721718311309814, + "num_tokens": 202600379.0, + "step": 341 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 0.6233608722686768, + "learning_rate": 1e-05, + "loss": 1.1226, + "mean_token_accuracy": 0.6779032945632935, + "num_tokens": 203203233.0, + "step": 342 + }, + { + "epoch": 1.6333333333333333, + "grad_norm": 0.5831724405288696, + "learning_rate": 1e-05, + "loss": 1.1159, + "mean_token_accuracy": 0.6793074607849121, + "num_tokens": 203802554.0, + "step": 343 + }, + { + "epoch": 1.638095238095238, + "grad_norm": 0.6623408794403076, + "learning_rate": 1e-05, + "loss": 1.1296, + "mean_token_accuracy": 0.677479088306427, + "num_tokens": 204395560.0, + "step": 344 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.5827105045318604, + "learning_rate": 1e-05, + "loss": 1.113, + "mean_token_accuracy": 0.6808111071586609, + "num_tokens": 205001404.0, + "step": 345 + }, + { + "epoch": 1.6476190476190475, + "grad_norm": 0.5602775812149048, + "learning_rate": 1e-05, + "loss": 1.1066, + "mean_token_accuracy": 0.6823267936706543, + "num_tokens": 205599855.0, + "step": 346 + }, + { + "epoch": 1.6523809523809523, + "grad_norm": 0.6435489654541016, + "learning_rate": 1e-05, + "loss": 1.1124, + "mean_token_accuracy": 0.6803141832351685, + "num_tokens": 206163338.0, + "step": 347 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 0.5933458209037781, + "learning_rate": 1e-05, + "loss": 1.137, + "mean_token_accuracy": 0.6748683452606201, + "num_tokens": 206741397.0, + "step": 348 + }, + { + "epoch": 1.6619047619047618, + "grad_norm": 0.5775367021560669, + "learning_rate": 1e-05, + "loss": 1.1298, + "mean_token_accuracy": 0.6758445501327515, + "num_tokens": 207323297.0, + "step": 349 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5773342251777649, + "learning_rate": 1e-05, + "loss": 1.1294, + "mean_token_accuracy": 0.6764418482780457, + "num_tokens": 207908589.0, + "step": 350 + }, + { + "epoch": 1.6714285714285713, + "grad_norm": 0.6353156566619873, + "learning_rate": 1e-05, + "loss": 1.1403, + "mean_token_accuracy": 0.6732203960418701, + "num_tokens": 208500281.0, + "step": 351 + }, + { + "epoch": 1.6761904761904762, + "grad_norm": 0.5841516852378845, + "learning_rate": 1e-05, + "loss": 1.1201, + "mean_token_accuracy": 0.6789692640304565, + "num_tokens": 209097427.0, + "step": 352 + }, + { + "epoch": 1.680952380952381, + "grad_norm": 0.5935720205307007, + "learning_rate": 1e-05, + "loss": 1.1217, + "mean_token_accuracy": 0.6778074502944946, + "num_tokens": 209704225.0, + "step": 353 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.6088152527809143, + "learning_rate": 1e-05, + "loss": 1.1177, + "mean_token_accuracy": 0.6796123385429382, + "num_tokens": 210313267.0, + "step": 354 + }, + { + "epoch": 1.6904761904761905, + "grad_norm": 0.5818439722061157, + "learning_rate": 1e-05, + "loss": 1.1273, + "mean_token_accuracy": 0.6770058870315552, + "num_tokens": 210918588.0, + "step": 355 + }, + { + "epoch": 1.6952380952380952, + "grad_norm": 0.6217803955078125, + "learning_rate": 1e-05, + "loss": 1.1268, + "mean_token_accuracy": 0.6772897243499756, + "num_tokens": 211508221.0, + "step": 356 + }, + { + "epoch": 1.7, + "grad_norm": 0.5793229937553406, + "learning_rate": 1e-05, + "loss": 1.1359, + "mean_token_accuracy": 0.6748672127723694, + "num_tokens": 212108728.0, + "step": 357 + }, + { + "epoch": 1.704761904761905, + "grad_norm": 0.5839233994483948, + "learning_rate": 1e-05, + "loss": 1.1226, + "mean_token_accuracy": 0.6776269674301147, + "num_tokens": 212705437.0, + "step": 358 + }, + { + "epoch": 1.7095238095238097, + "grad_norm": 0.6158073544502258, + "learning_rate": 1e-05, + "loss": 1.1324, + "mean_token_accuracy": 0.6745504140853882, + "num_tokens": 213300176.0, + "step": 359 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.6093515753746033, + "learning_rate": 1e-05, + "loss": 1.132, + "mean_token_accuracy": 0.6751940250396729, + "num_tokens": 213890731.0, + "step": 360 + }, + { + "epoch": 1.7190476190476192, + "grad_norm": 0.629436194896698, + "learning_rate": 1e-05, + "loss": 1.1147, + "mean_token_accuracy": 0.6785677075386047, + "num_tokens": 214471137.0, + "step": 361 + }, + { + "epoch": 1.723809523809524, + "grad_norm": 0.6373199820518494, + "learning_rate": 1e-05, + "loss": 1.1169, + "mean_token_accuracy": 0.6792606115341187, + "num_tokens": 215062165.0, + "step": 362 + }, + { + "epoch": 1.7285714285714286, + "grad_norm": 0.5850217938423157, + "learning_rate": 1e-05, + "loss": 1.1264, + "mean_token_accuracy": 0.6766684055328369, + "num_tokens": 215662977.0, + "step": 363 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.676506757736206, + "learning_rate": 1e-05, + "loss": 1.1328, + "mean_token_accuracy": 0.6750730276107788, + "num_tokens": 216253942.0, + "step": 364 + }, + { + "epoch": 1.7380952380952381, + "grad_norm": 0.5996358394622803, + "learning_rate": 1e-05, + "loss": 1.1234, + "mean_token_accuracy": 0.6771166920661926, + "num_tokens": 216847247.0, + "step": 365 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 0.604375422000885, + "learning_rate": 1e-05, + "loss": 1.1111, + "mean_token_accuracy": 0.6807925701141357, + "num_tokens": 217427979.0, + "step": 366 + }, + { + "epoch": 1.7476190476190476, + "grad_norm": 0.6484256386756897, + "learning_rate": 1e-05, + "loss": 1.1149, + "mean_token_accuracy": 0.67896568775177, + "num_tokens": 218020622.0, + "step": 367 + }, + { + "epoch": 1.7523809523809524, + "grad_norm": 0.5445154905319214, + "learning_rate": 1e-05, + "loss": 1.1238, + "mean_token_accuracy": 0.677640438079834, + "num_tokens": 218613768.0, + "step": 368 + }, + { + "epoch": 1.7571428571428571, + "grad_norm": 0.5835940837860107, + "learning_rate": 1e-05, + "loss": 1.1352, + "mean_token_accuracy": 0.6746830940246582, + "num_tokens": 219217863.0, + "step": 369 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.6108807325363159, + "learning_rate": 1e-05, + "loss": 1.1245, + "mean_token_accuracy": 0.6771240234375, + "num_tokens": 219826128.0, + "step": 370 + }, + { + "epoch": 1.7666666666666666, + "grad_norm": 0.5301618576049805, + "learning_rate": 1e-05, + "loss": 1.1193, + "mean_token_accuracy": 0.6791725158691406, + "num_tokens": 220424737.0, + "step": 371 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.567722737789154, + "learning_rate": 1e-05, + "loss": 1.1241, + "mean_token_accuracy": 0.6771541833877563, + "num_tokens": 221010073.0, + "step": 372 + }, + { + "epoch": 1.776190476190476, + "grad_norm": 0.6946297883987427, + "learning_rate": 1e-05, + "loss": 1.1205, + "mean_token_accuracy": 0.678805410861969, + "num_tokens": 221614799.0, + "step": 373 + }, + { + "epoch": 1.7809523809523808, + "grad_norm": 0.5566631555557251, + "learning_rate": 1e-05, + "loss": 1.1185, + "mean_token_accuracy": 0.6782611012458801, + "num_tokens": 222215943.0, + "step": 374 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.5999249219894409, + "learning_rate": 1e-05, + "loss": 1.114, + "mean_token_accuracy": 0.6802798509597778, + "num_tokens": 222803822.0, + "step": 375 + }, + { + "epoch": 1.7904761904761903, + "grad_norm": 0.5825783014297485, + "learning_rate": 1e-05, + "loss": 1.1314, + "mean_token_accuracy": 0.6754652261734009, + "num_tokens": 223409541.0, + "step": 376 + }, + { + "epoch": 1.795238095238095, + "grad_norm": 0.5893160700798035, + "learning_rate": 1e-05, + "loss": 1.1183, + "mean_token_accuracy": 0.6782077550888062, + "num_tokens": 223996446.0, + "step": 377 + }, + { + "epoch": 1.8, + "grad_norm": 0.5960800051689148, + "learning_rate": 1e-05, + "loss": 1.1203, + "mean_token_accuracy": 0.678328275680542, + "num_tokens": 224599074.0, + "step": 378 + }, + { + "epoch": 1.8047619047619048, + "grad_norm": 0.5972325205802917, + "learning_rate": 1e-05, + "loss": 1.1122, + "mean_token_accuracy": 0.6802579760551453, + "num_tokens": 225184557.0, + "step": 379 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.597683310508728, + "learning_rate": 1e-05, + "loss": 1.1185, + "mean_token_accuracy": 0.6798061728477478, + "num_tokens": 225774197.0, + "step": 380 + }, + { + "epoch": 1.8142857142857143, + "grad_norm": 0.575453519821167, + "learning_rate": 1e-05, + "loss": 1.1072, + "mean_token_accuracy": 0.6810543537139893, + "num_tokens": 226359063.0, + "step": 381 + }, + { + "epoch": 1.819047619047619, + "grad_norm": 0.5560538172721863, + "learning_rate": 1e-05, + "loss": 1.1237, + "mean_token_accuracy": 0.6774187088012695, + "num_tokens": 226962202.0, + "step": 382 + }, + { + "epoch": 1.8238095238095238, + "grad_norm": 0.6427722573280334, + "learning_rate": 1e-05, + "loss": 1.1218, + "mean_token_accuracy": 0.677949070930481, + "num_tokens": 227541002.0, + "step": 383 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.6143935322761536, + "learning_rate": 1e-05, + "loss": 1.1221, + "mean_token_accuracy": 0.6778963804244995, + "num_tokens": 228134124.0, + "step": 384 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.6365751624107361, + "learning_rate": 1e-05, + "loss": 1.112, + "mean_token_accuracy": 0.6797761917114258, + "num_tokens": 228729717.0, + "step": 385 + }, + { + "epoch": 1.8380952380952382, + "grad_norm": 0.719041109085083, + "learning_rate": 1e-05, + "loss": 1.1178, + "mean_token_accuracy": 0.6780564785003662, + "num_tokens": 229318931.0, + "step": 386 + }, + { + "epoch": 1.842857142857143, + "grad_norm": 0.6031278967857361, + "learning_rate": 1e-05, + "loss": 1.1246, + "mean_token_accuracy": 0.6776800155639648, + "num_tokens": 229923675.0, + "step": 387 + }, + { + "epoch": 1.8476190476190477, + "grad_norm": 0.6627750396728516, + "learning_rate": 1e-05, + "loss": 1.1149, + "mean_token_accuracy": 0.6797564029693604, + "num_tokens": 230514254.0, + "step": 388 + }, + { + "epoch": 1.8523809523809525, + "grad_norm": 0.576654314994812, + "learning_rate": 1e-05, + "loss": 1.1228, + "mean_token_accuracy": 0.6780418157577515, + "num_tokens": 231113801.0, + "step": 389 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.6316273212432861, + "learning_rate": 1e-05, + "loss": 1.1119, + "mean_token_accuracy": 0.6792047023773193, + "num_tokens": 231709098.0, + "step": 390 + }, + { + "epoch": 1.861904761904762, + "grad_norm": 0.5546997785568237, + "learning_rate": 1e-05, + "loss": 1.1247, + "mean_token_accuracy": 0.6769775748252869, + "num_tokens": 232311276.0, + "step": 391 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.617088794708252, + "learning_rate": 1e-05, + "loss": 1.113, + "mean_token_accuracy": 0.6795423030853271, + "num_tokens": 232904607.0, + "step": 392 + }, + { + "epoch": 1.8714285714285714, + "grad_norm": 0.611702561378479, + "learning_rate": 1e-05, + "loss": 1.1057, + "mean_token_accuracy": 0.6821488738059998, + "num_tokens": 233493254.0, + "step": 393 + }, + { + "epoch": 1.8761904761904762, + "grad_norm": 0.6276193261146545, + "learning_rate": 1e-05, + "loss": 1.1154, + "mean_token_accuracy": 0.6800172328948975, + "num_tokens": 234085431.0, + "step": 394 + }, + { + "epoch": 1.880952380952381, + "grad_norm": 0.6570289731025696, + "learning_rate": 1e-05, + "loss": 1.1245, + "mean_token_accuracy": 0.676892876625061, + "num_tokens": 234685396.0, + "step": 395 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.6350821256637573, + "learning_rate": 1e-05, + "loss": 1.1253, + "mean_token_accuracy": 0.6770513653755188, + "num_tokens": 235280049.0, + "step": 396 + }, + { + "epoch": 1.8904761904761904, + "grad_norm": 0.6419028639793396, + "learning_rate": 1e-05, + "loss": 1.1258, + "mean_token_accuracy": 0.6772979497909546, + "num_tokens": 235867276.0, + "step": 397 + }, + { + "epoch": 1.8952380952380952, + "grad_norm": 0.6098426580429077, + "learning_rate": 1e-05, + "loss": 1.1126, + "mean_token_accuracy": 0.6804271936416626, + "num_tokens": 236448647.0, + "step": 398 + }, + { + "epoch": 1.9, + "grad_norm": 0.5854616165161133, + "learning_rate": 1e-05, + "loss": 1.1256, + "mean_token_accuracy": 0.6755622625350952, + "num_tokens": 237054180.0, + "step": 399 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.6416271328926086, + "learning_rate": 1e-05, + "loss": 1.1394, + "mean_token_accuracy": 0.6737023591995239, + "num_tokens": 237658218.0, + "step": 400 + }, + { + "epoch": 1.9095238095238094, + "grad_norm": 0.5833379626274109, + "learning_rate": 1e-05, + "loss": 1.1223, + "mean_token_accuracy": 0.6782907247543335, + "num_tokens": 238248393.0, + "step": 401 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.6798136830329895, + "learning_rate": 1e-05, + "loss": 1.1096, + "mean_token_accuracy": 0.6816190481185913, + "num_tokens": 238838548.0, + "step": 402 + }, + { + "epoch": 1.919047619047619, + "grad_norm": 0.5994821786880493, + "learning_rate": 1e-05, + "loss": 1.1154, + "mean_token_accuracy": 0.6799057722091675, + "num_tokens": 239442502.0, + "step": 403 + }, + { + "epoch": 1.9238095238095239, + "grad_norm": 0.6224843263626099, + "learning_rate": 1e-05, + "loss": 1.1273, + "mean_token_accuracy": 0.6760965585708618, + "num_tokens": 240029019.0, + "step": 404 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.6100861430168152, + "learning_rate": 1e-05, + "loss": 1.1134, + "mean_token_accuracy": 0.6803538799285889, + "num_tokens": 240623504.0, + "step": 405 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.6026962399482727, + "learning_rate": 1e-05, + "loss": 1.1022, + "mean_token_accuracy": 0.6817559599876404, + "num_tokens": 241217102.0, + "step": 406 + }, + { + "epoch": 1.938095238095238, + "grad_norm": 0.6529442667961121, + "learning_rate": 1e-05, + "loss": 1.1141, + "mean_token_accuracy": 0.6804797649383545, + "num_tokens": 241812222.0, + "step": 407 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.6519430875778198, + "learning_rate": 1e-05, + "loss": 1.1085, + "mean_token_accuracy": 0.680460512638092, + "num_tokens": 242388669.0, + "step": 408 + }, + { + "epoch": 1.9476190476190476, + "grad_norm": 0.7020300626754761, + "learning_rate": 1e-05, + "loss": 1.1111, + "mean_token_accuracy": 0.6802721619606018, + "num_tokens": 242965686.0, + "step": 409 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.6024628281593323, + "learning_rate": 1e-05, + "loss": 1.1183, + "mean_token_accuracy": 0.6792590022087097, + "num_tokens": 243553853.0, + "step": 410 + }, + { + "epoch": 1.9571428571428573, + "grad_norm": 0.7494162321090698, + "learning_rate": 1e-05, + "loss": 1.1135, + "mean_token_accuracy": 0.6788877844810486, + "num_tokens": 244139754.0, + "step": 411 + }, + { + "epoch": 1.961904761904762, + "grad_norm": 0.6602755188941956, + "learning_rate": 1e-05, + "loss": 1.1176, + "mean_token_accuracy": 0.6777335405349731, + "num_tokens": 244736423.0, + "step": 412 + }, + { + "epoch": 1.9666666666666668, + "grad_norm": 0.7016980051994324, + "learning_rate": 1e-05, + "loss": 1.1186, + "mean_token_accuracy": 0.6798810362815857, + "num_tokens": 245337682.0, + "step": 413 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 0.6483145356178284, + "learning_rate": 1e-05, + "loss": 1.1172, + "mean_token_accuracy": 0.67914879322052, + "num_tokens": 245952105.0, + "step": 414 + }, + { + "epoch": 1.9761904761904763, + "grad_norm": 0.678092896938324, + "learning_rate": 1e-05, + "loss": 1.1039, + "mean_token_accuracy": 0.6819472312927246, + "num_tokens": 246540999.0, + "step": 415 + }, + { + "epoch": 1.980952380952381, + "grad_norm": 0.7507527470588684, + "learning_rate": 1e-05, + "loss": 1.1103, + "mean_token_accuracy": 0.6814316511154175, + "num_tokens": 247142303.0, + "step": 416 + }, + { + "epoch": 1.9857142857142858, + "grad_norm": 0.625765323638916, + "learning_rate": 1e-05, + "loss": 1.108, + "mean_token_accuracy": 0.6812607049942017, + "num_tokens": 247732988.0, + "step": 417 + }, + { + "epoch": 1.9904761904761905, + "grad_norm": 0.6421918869018555, + "learning_rate": 1e-05, + "loss": 1.1022, + "mean_token_accuracy": 0.681933581829071, + "num_tokens": 248334744.0, + "step": 418 + }, + { + "epoch": 1.9952380952380953, + "grad_norm": 0.6160528659820557, + "learning_rate": 1e-05, + "loss": 1.1133, + "mean_token_accuracy": 0.6797480583190918, + "num_tokens": 248930347.0, + "step": 419 + }, + { + "epoch": 2.0, + "grad_norm": 0.703513503074646, + "learning_rate": 1e-05, + "loss": 1.129, + "mean_token_accuracy": 0.676598846912384, + "num_tokens": 249522093.0, + "step": 420 + }, + { + "epoch": 2.0047619047619047, + "grad_norm": 0.7784668207168579, + "learning_rate": 1e-05, + "loss": 1.0772, + "mean_token_accuracy": 0.687883734703064, + "num_tokens": 250112163.0, + "step": 421 + }, + { + "epoch": 2.0095238095238095, + "grad_norm": 0.7685954570770264, + "learning_rate": 1e-05, + "loss": 1.065, + "mean_token_accuracy": 0.6909651160240173, + "num_tokens": 250690466.0, + "step": 422 + }, + { + "epoch": 2.0142857142857142, + "grad_norm": 0.5822970867156982, + "learning_rate": 1e-05, + "loss": 1.0678, + "mean_token_accuracy": 0.68952476978302, + "num_tokens": 251279220.0, + "step": 423 + }, + { + "epoch": 2.019047619047619, + "grad_norm": 0.8003807663917542, + "learning_rate": 1e-05, + "loss": 1.071, + "mean_token_accuracy": 0.6891335248947144, + "num_tokens": 251871717.0, + "step": 424 + }, + { + "epoch": 2.0238095238095237, + "grad_norm": 0.6656951904296875, + "learning_rate": 1e-05, + "loss": 1.0907, + "mean_token_accuracy": 0.6843781471252441, + "num_tokens": 252474129.0, + "step": 425 + }, + { + "epoch": 2.0285714285714285, + "grad_norm": 0.662339448928833, + "learning_rate": 1e-05, + "loss": 1.0518, + "mean_token_accuracy": 0.693079948425293, + "num_tokens": 253069065.0, + "step": 426 + }, + { + "epoch": 2.033333333333333, + "grad_norm": 0.6397184729576111, + "learning_rate": 1e-05, + "loss": 1.079, + "mean_token_accuracy": 0.6863641738891602, + "num_tokens": 253654392.0, + "step": 427 + }, + { + "epoch": 2.038095238095238, + "grad_norm": 0.6415942907333374, + "learning_rate": 1e-05, + "loss": 1.0688, + "mean_token_accuracy": 0.6888935565948486, + "num_tokens": 254245168.0, + "step": 428 + }, + { + "epoch": 2.0428571428571427, + "grad_norm": 0.6560488939285278, + "learning_rate": 1e-05, + "loss": 1.0824, + "mean_token_accuracy": 0.685175895690918, + "num_tokens": 254841132.0, + "step": 429 + }, + { + "epoch": 2.0476190476190474, + "grad_norm": 0.5839130878448486, + "learning_rate": 1e-05, + "loss": 1.0676, + "mean_token_accuracy": 0.6896021366119385, + "num_tokens": 255433793.0, + "step": 430 + }, + { + "epoch": 2.052380952380952, + "grad_norm": 0.7360151410102844, + "learning_rate": 1e-05, + "loss": 1.0658, + "mean_token_accuracy": 0.6903011798858643, + "num_tokens": 256015225.0, + "step": 431 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 0.633699893951416, + "learning_rate": 1e-05, + "loss": 1.0638, + "mean_token_accuracy": 0.6905348300933838, + "num_tokens": 256617333.0, + "step": 432 + }, + { + "epoch": 2.0619047619047617, + "grad_norm": 0.6784190535545349, + "learning_rate": 1e-05, + "loss": 1.0913, + "mean_token_accuracy": 0.6846885681152344, + "num_tokens": 257214120.0, + "step": 433 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.6749794483184814, + "learning_rate": 1e-05, + "loss": 1.0707, + "mean_token_accuracy": 0.6884802579879761, + "num_tokens": 257807506.0, + "step": 434 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.6474000811576843, + "learning_rate": 1e-05, + "loss": 1.0873, + "mean_token_accuracy": 0.6844640970230103, + "num_tokens": 258414225.0, + "step": 435 + }, + { + "epoch": 2.0761904761904764, + "grad_norm": 0.6300811171531677, + "learning_rate": 1e-05, + "loss": 1.0684, + "mean_token_accuracy": 0.688113808631897, + "num_tokens": 259009305.0, + "step": 436 + }, + { + "epoch": 2.080952380952381, + "grad_norm": 0.6160655617713928, + "learning_rate": 1e-05, + "loss": 1.0626, + "mean_token_accuracy": 0.690530002117157, + "num_tokens": 259600283.0, + "step": 437 + }, + { + "epoch": 2.085714285714286, + "grad_norm": 0.5936851501464844, + "learning_rate": 1e-05, + "loss": 1.0876, + "mean_token_accuracy": 0.6846874952316284, + "num_tokens": 260203907.0, + "step": 438 + }, + { + "epoch": 2.0904761904761906, + "grad_norm": 0.6563723683357239, + "learning_rate": 1e-05, + "loss": 1.0823, + "mean_token_accuracy": 0.6850008964538574, + "num_tokens": 260795356.0, + "step": 439 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 0.6244327425956726, + "learning_rate": 1e-05, + "loss": 1.0595, + "mean_token_accuracy": 0.6918923854827881, + "num_tokens": 261376414.0, + "step": 440 + }, + { + "epoch": 2.1, + "grad_norm": 0.6768208146095276, + "learning_rate": 1e-05, + "loss": 1.0854, + "mean_token_accuracy": 0.6857748031616211, + "num_tokens": 261965206.0, + "step": 441 + }, + { + "epoch": 2.104761904761905, + "grad_norm": 0.6261032819747925, + "learning_rate": 1e-05, + "loss": 1.0792, + "mean_token_accuracy": 0.6868791580200195, + "num_tokens": 262561371.0, + "step": 442 + }, + { + "epoch": 2.1095238095238096, + "grad_norm": 0.6388991475105286, + "learning_rate": 1e-05, + "loss": 1.068, + "mean_token_accuracy": 0.689771294593811, + "num_tokens": 263159935.0, + "step": 443 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.6453383564949036, + "learning_rate": 1e-05, + "loss": 1.0803, + "mean_token_accuracy": 0.6851140260696411, + "num_tokens": 263754299.0, + "step": 444 + }, + { + "epoch": 2.119047619047619, + "grad_norm": 0.6248214244842529, + "learning_rate": 1e-05, + "loss": 1.0792, + "mean_token_accuracy": 0.686503529548645, + "num_tokens": 264354630.0, + "step": 445 + }, + { + "epoch": 2.123809523809524, + "grad_norm": 0.6909031271934509, + "learning_rate": 1e-05, + "loss": 1.0995, + "mean_token_accuracy": 0.681476891040802, + "num_tokens": 264964353.0, + "step": 446 + }, + { + "epoch": 2.1285714285714286, + "grad_norm": 0.6381927132606506, + "learning_rate": 1e-05, + "loss": 1.0816, + "mean_token_accuracy": 0.6861193180084229, + "num_tokens": 265561411.0, + "step": 447 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.669456958770752, + "learning_rate": 1e-05, + "loss": 1.073, + "mean_token_accuracy": 0.6876237392425537, + "num_tokens": 266155790.0, + "step": 448 + }, + { + "epoch": 2.138095238095238, + "grad_norm": 0.6266065239906311, + "learning_rate": 1e-05, + "loss": 1.0788, + "mean_token_accuracy": 0.6870714426040649, + "num_tokens": 266757269.0, + "step": 449 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.6428273916244507, + "learning_rate": 1e-05, + "loss": 1.0942, + "mean_token_accuracy": 0.6831685304641724, + "num_tokens": 267369143.0, + "step": 450 + }, + { + "epoch": 2.1476190476190475, + "grad_norm": 0.6169470548629761, + "learning_rate": 1e-05, + "loss": 1.0619, + "mean_token_accuracy": 0.6913155317306519, + "num_tokens": 267965437.0, + "step": 451 + }, + { + "epoch": 2.1523809523809523, + "grad_norm": 0.6351789832115173, + "learning_rate": 1e-05, + "loss": 1.0713, + "mean_token_accuracy": 0.6888561248779297, + "num_tokens": 268571463.0, + "step": 452 + }, + { + "epoch": 2.157142857142857, + "grad_norm": 0.6532635688781738, + "learning_rate": 1e-05, + "loss": 1.0698, + "mean_token_accuracy": 0.6889727115631104, + "num_tokens": 269157041.0, + "step": 453 + }, + { + "epoch": 2.1619047619047618, + "grad_norm": 0.5989878177642822, + "learning_rate": 1e-05, + "loss": 1.0682, + "mean_token_accuracy": 0.6890783309936523, + "num_tokens": 269758195.0, + "step": 454 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.6337672472000122, + "learning_rate": 1e-05, + "loss": 1.0969, + "mean_token_accuracy": 0.6822439432144165, + "num_tokens": 270349613.0, + "step": 455 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 0.5972429513931274, + "learning_rate": 1e-05, + "loss": 1.0831, + "mean_token_accuracy": 0.6840848326683044, + "num_tokens": 270947174.0, + "step": 456 + }, + { + "epoch": 2.176190476190476, + "grad_norm": 0.6298529505729675, + "learning_rate": 1e-05, + "loss": 1.0921, + "mean_token_accuracy": 0.6836713552474976, + "num_tokens": 271549889.0, + "step": 457 + }, + { + "epoch": 2.1809523809523808, + "grad_norm": 0.574796199798584, + "learning_rate": 1e-05, + "loss": 1.064, + "mean_token_accuracy": 0.6906387805938721, + "num_tokens": 272139782.0, + "step": 458 + }, + { + "epoch": 2.185714285714286, + "grad_norm": 0.6812316179275513, + "learning_rate": 1e-05, + "loss": 1.0762, + "mean_token_accuracy": 0.687111496925354, + "num_tokens": 272746279.0, + "step": 459 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.5981315970420837, + "learning_rate": 1e-05, + "loss": 1.0626, + "mean_token_accuracy": 0.6907453536987305, + "num_tokens": 273348449.0, + "step": 460 + }, + { + "epoch": 2.1952380952380954, + "grad_norm": 0.6438897252082825, + "learning_rate": 1e-05, + "loss": 1.0853, + "mean_token_accuracy": 0.6858918070793152, + "num_tokens": 273949928.0, + "step": 461 + }, + { + "epoch": 2.2, + "grad_norm": 0.6236709952354431, + "learning_rate": 1e-05, + "loss": 1.0733, + "mean_token_accuracy": 0.6877059936523438, + "num_tokens": 274548070.0, + "step": 462 + }, + { + "epoch": 2.204761904761905, + "grad_norm": 0.6749060153961182, + "learning_rate": 1e-05, + "loss": 1.0758, + "mean_token_accuracy": 0.6867290735244751, + "num_tokens": 275135656.0, + "step": 463 + }, + { + "epoch": 2.2095238095238097, + "grad_norm": 0.6628844738006592, + "learning_rate": 1e-05, + "loss": 1.0765, + "mean_token_accuracy": 0.6874538660049438, + "num_tokens": 275740663.0, + "step": 464 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.5728548169136047, + "learning_rate": 1e-05, + "loss": 1.0754, + "mean_token_accuracy": 0.6882718205451965, + "num_tokens": 276346207.0, + "step": 465 + }, + { + "epoch": 2.219047619047619, + "grad_norm": 0.6232889294624329, + "learning_rate": 1e-05, + "loss": 1.0752, + "mean_token_accuracy": 0.6872685551643372, + "num_tokens": 276940208.0, + "step": 466 + }, + { + "epoch": 2.223809523809524, + "grad_norm": 0.6447910070419312, + "learning_rate": 1e-05, + "loss": 1.091, + "mean_token_accuracy": 0.6836293339729309, + "num_tokens": 277539762.0, + "step": 467 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 0.6113771796226501, + "learning_rate": 1e-05, + "loss": 1.0865, + "mean_token_accuracy": 0.684094250202179, + "num_tokens": 278136526.0, + "step": 468 + }, + { + "epoch": 2.2333333333333334, + "grad_norm": 0.6344524025917053, + "learning_rate": 1e-05, + "loss": 1.0772, + "mean_token_accuracy": 0.6870338320732117, + "num_tokens": 278723575.0, + "step": 469 + }, + { + "epoch": 2.238095238095238, + "grad_norm": 0.6180852055549622, + "learning_rate": 1e-05, + "loss": 1.0544, + "mean_token_accuracy": 0.6927859783172607, + "num_tokens": 279313692.0, + "step": 470 + }, + { + "epoch": 2.242857142857143, + "grad_norm": 0.6375457644462585, + "learning_rate": 1e-05, + "loss": 1.0869, + "mean_token_accuracy": 0.6847492456436157, + "num_tokens": 279911596.0, + "step": 471 + }, + { + "epoch": 2.2476190476190476, + "grad_norm": 0.6032583117485046, + "learning_rate": 1e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.6893506050109863, + "num_tokens": 280516626.0, + "step": 472 + }, + { + "epoch": 2.2523809523809524, + "grad_norm": 0.6571868062019348, + "learning_rate": 1e-05, + "loss": 1.0723, + "mean_token_accuracy": 0.6889193654060364, + "num_tokens": 281109826.0, + "step": 473 + }, + { + "epoch": 2.257142857142857, + "grad_norm": 0.5816087126731873, + "learning_rate": 1e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.6873452663421631, + "num_tokens": 281705908.0, + "step": 474 + }, + { + "epoch": 2.261904761904762, + "grad_norm": 0.6110855340957642, + "learning_rate": 1e-05, + "loss": 1.0733, + "mean_token_accuracy": 0.6875293850898743, + "num_tokens": 282295646.0, + "step": 475 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.5722987055778503, + "learning_rate": 1e-05, + "loss": 1.064, + "mean_token_accuracy": 0.6898794174194336, + "num_tokens": 282882984.0, + "step": 476 + }, + { + "epoch": 2.2714285714285714, + "grad_norm": 0.5756980776786804, + "learning_rate": 1e-05, + "loss": 1.0705, + "mean_token_accuracy": 0.6888871192932129, + "num_tokens": 283470314.0, + "step": 477 + }, + { + "epoch": 2.276190476190476, + "grad_norm": 0.6090242862701416, + "learning_rate": 1e-05, + "loss": 1.0729, + "mean_token_accuracy": 0.6876958012580872, + "num_tokens": 284064822.0, + "step": 478 + }, + { + "epoch": 2.280952380952381, + "grad_norm": 0.551956295967102, + "learning_rate": 1e-05, + "loss": 1.0666, + "mean_token_accuracy": 0.6899924278259277, + "num_tokens": 284659143.0, + "step": 479 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.617386519908905, + "learning_rate": 1e-05, + "loss": 1.0789, + "mean_token_accuracy": 0.6873286366462708, + "num_tokens": 285260603.0, + "step": 480 + }, + { + "epoch": 2.2904761904761903, + "grad_norm": 0.5895305871963501, + "learning_rate": 1e-05, + "loss": 1.0668, + "mean_token_accuracy": 0.6887931823730469, + "num_tokens": 285858617.0, + "step": 481 + }, + { + "epoch": 2.295238095238095, + "grad_norm": 0.575018584728241, + "learning_rate": 1e-05, + "loss": 1.0733, + "mean_token_accuracy": 0.6886229515075684, + "num_tokens": 286462909.0, + "step": 482 + }, + { + "epoch": 2.3, + "grad_norm": 0.680483341217041, + "learning_rate": 1e-05, + "loss": 1.0686, + "mean_token_accuracy": 0.6894232034683228, + "num_tokens": 287057508.0, + "step": 483 + }, + { + "epoch": 2.3047619047619046, + "grad_norm": 0.6086472868919373, + "learning_rate": 1e-05, + "loss": 1.0784, + "mean_token_accuracy": 0.6863738298416138, + "num_tokens": 287647864.0, + "step": 484 + }, + { + "epoch": 2.3095238095238093, + "grad_norm": 0.6269891858100891, + "learning_rate": 1e-05, + "loss": 1.0803, + "mean_token_accuracy": 0.6864203810691833, + "num_tokens": 288244654.0, + "step": 485 + }, + { + "epoch": 2.314285714285714, + "grad_norm": 0.6842952370643616, + "learning_rate": 1e-05, + "loss": 1.0897, + "mean_token_accuracy": 0.684012770652771, + "num_tokens": 288833805.0, + "step": 486 + }, + { + "epoch": 2.319047619047619, + "grad_norm": 0.5772620439529419, + "learning_rate": 1e-05, + "loss": 1.0728, + "mean_token_accuracy": 0.6879225969314575, + "num_tokens": 289430249.0, + "step": 487 + }, + { + "epoch": 2.323809523809524, + "grad_norm": 0.6799498796463013, + "learning_rate": 1e-05, + "loss": 1.0737, + "mean_token_accuracy": 0.6892322897911072, + "num_tokens": 290017640.0, + "step": 488 + }, + { + "epoch": 2.3285714285714287, + "grad_norm": 0.63170325756073, + "learning_rate": 1e-05, + "loss": 1.0694, + "mean_token_accuracy": 0.6884621381759644, + "num_tokens": 290598414.0, + "step": 489 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.6786331534385681, + "learning_rate": 1e-05, + "loss": 1.061, + "mean_token_accuracy": 0.6906882524490356, + "num_tokens": 291181812.0, + "step": 490 + }, + { + "epoch": 2.3380952380952382, + "grad_norm": 0.6489508748054504, + "learning_rate": 1e-05, + "loss": 1.0747, + "mean_token_accuracy": 0.6877006888389587, + "num_tokens": 291764317.0, + "step": 491 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.6271830797195435, + "learning_rate": 1e-05, + "loss": 1.0809, + "mean_token_accuracy": 0.6854414343833923, + "num_tokens": 292350504.0, + "step": 492 + }, + { + "epoch": 2.3476190476190477, + "grad_norm": 0.6458184123039246, + "learning_rate": 1e-05, + "loss": 1.0777, + "mean_token_accuracy": 0.6864031553268433, + "num_tokens": 292951776.0, + "step": 493 + }, + { + "epoch": 2.3523809523809525, + "grad_norm": 0.6648980379104614, + "learning_rate": 1e-05, + "loss": 1.0695, + "mean_token_accuracy": 0.6887059211730957, + "num_tokens": 293532488.0, + "step": 494 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.6425085067749023, + "learning_rate": 1e-05, + "loss": 1.0575, + "mean_token_accuracy": 0.6918837428092957, + "num_tokens": 294121146.0, + "step": 495 + }, + { + "epoch": 2.361904761904762, + "grad_norm": 0.6645520329475403, + "learning_rate": 1e-05, + "loss": 1.0768, + "mean_token_accuracy": 0.6873211860656738, + "num_tokens": 294726732.0, + "step": 496 + }, + { + "epoch": 2.3666666666666667, + "grad_norm": 0.6538220047950745, + "learning_rate": 1e-05, + "loss": 1.0682, + "mean_token_accuracy": 0.6892035007476807, + "num_tokens": 295306697.0, + "step": 497 + }, + { + "epoch": 2.3714285714285714, + "grad_norm": 0.7154629230499268, + "learning_rate": 1e-05, + "loss": 1.0893, + "mean_token_accuracy": 0.6833094358444214, + "num_tokens": 295894972.0, + "step": 498 + }, + { + "epoch": 2.376190476190476, + "grad_norm": 0.6492322087287903, + "learning_rate": 1e-05, + "loss": 1.0831, + "mean_token_accuracy": 0.6853781938552856, + "num_tokens": 296505345.0, + "step": 499 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.7426714301109314, + "learning_rate": 1e-05, + "loss": 1.0664, + "mean_token_accuracy": 0.6893447637557983, + "num_tokens": 297100909.0, + "step": 500 + }, + { + "epoch": 2.3857142857142857, + "grad_norm": 0.6399804353713989, + "learning_rate": 1e-05, + "loss": 1.0743, + "mean_token_accuracy": 0.688417375087738, + "num_tokens": 297690267.0, + "step": 501 + }, + { + "epoch": 2.3904761904761904, + "grad_norm": 0.599839985370636, + "learning_rate": 1e-05, + "loss": 1.0646, + "mean_token_accuracy": 0.6897764801979065, + "num_tokens": 298283484.0, + "step": 502 + }, + { + "epoch": 2.395238095238095, + "grad_norm": 0.6296051740646362, + "learning_rate": 1e-05, + "loss": 1.0814, + "mean_token_accuracy": 0.685540497303009, + "num_tokens": 298880193.0, + "step": 503 + }, + { + "epoch": 2.4, + "grad_norm": 0.5922709107398987, + "learning_rate": 1e-05, + "loss": 1.058, + "mean_token_accuracy": 0.6912336349487305, + "num_tokens": 299479995.0, + "step": 504 + }, + { + "epoch": 2.4047619047619047, + "grad_norm": 0.608103334903717, + "learning_rate": 1e-05, + "loss": 1.0731, + "mean_token_accuracy": 0.6877481937408447, + "num_tokens": 300068384.0, + "step": 505 + }, + { + "epoch": 2.4095238095238094, + "grad_norm": 0.6003749966621399, + "learning_rate": 1e-05, + "loss": 1.083, + "mean_token_accuracy": 0.6847676038742065, + "num_tokens": 300687274.0, + "step": 506 + }, + { + "epoch": 2.414285714285714, + "grad_norm": 0.5747948884963989, + "learning_rate": 1e-05, + "loss": 1.075, + "mean_token_accuracy": 0.6867921352386475, + "num_tokens": 301288728.0, + "step": 507 + }, + { + "epoch": 2.419047619047619, + "grad_norm": 0.6287463307380676, + "learning_rate": 1e-05, + "loss": 1.0698, + "mean_token_accuracy": 0.6888238787651062, + "num_tokens": 301868926.0, + "step": 508 + }, + { + "epoch": 2.4238095238095236, + "grad_norm": 0.5455256104469299, + "learning_rate": 1e-05, + "loss": 1.0644, + "mean_token_accuracy": 0.690401017665863, + "num_tokens": 302467630.0, + "step": 509 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.6476891040802002, + "learning_rate": 1e-05, + "loss": 1.09, + "mean_token_accuracy": 0.6842606663703918, + "num_tokens": 303058965.0, + "step": 510 + }, + { + "epoch": 2.4333333333333336, + "grad_norm": 0.6696739792823792, + "learning_rate": 1e-05, + "loss": 1.091, + "mean_token_accuracy": 0.6829922795295715, + "num_tokens": 303639694.0, + "step": 511 + }, + { + "epoch": 2.4380952380952383, + "grad_norm": 0.5850697159767151, + "learning_rate": 1e-05, + "loss": 1.0651, + "mean_token_accuracy": 0.6903361082077026, + "num_tokens": 304234504.0, + "step": 512 + }, + { + "epoch": 2.442857142857143, + "grad_norm": 0.6123826503753662, + "learning_rate": 1e-05, + "loss": 1.0848, + "mean_token_accuracy": 0.6855412125587463, + "num_tokens": 304822484.0, + "step": 513 + }, + { + "epoch": 2.447619047619048, + "grad_norm": 0.6242313981056213, + "learning_rate": 1e-05, + "loss": 1.069, + "mean_token_accuracy": 0.6895902156829834, + "num_tokens": 305405226.0, + "step": 514 + }, + { + "epoch": 2.4523809523809526, + "grad_norm": 0.6153740286827087, + "learning_rate": 1e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.6889458298683167, + "num_tokens": 306007153.0, + "step": 515 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.6674852967262268, + "learning_rate": 1e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.6897221803665161, + "num_tokens": 306588836.0, + "step": 516 + }, + { + "epoch": 2.461904761904762, + "grad_norm": 0.6560084819793701, + "learning_rate": 1e-05, + "loss": 1.0804, + "mean_token_accuracy": 0.6861131191253662, + "num_tokens": 307200955.0, + "step": 517 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.5911952257156372, + "learning_rate": 1e-05, + "loss": 1.0669, + "mean_token_accuracy": 0.6889474987983704, + "num_tokens": 307799028.0, + "step": 518 + }, + { + "epoch": 2.4714285714285715, + "grad_norm": 0.6963088512420654, + "learning_rate": 1e-05, + "loss": 1.0965, + "mean_token_accuracy": 0.6822454929351807, + "num_tokens": 308389748.0, + "step": 519 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 0.7166724801063538, + "learning_rate": 1e-05, + "loss": 1.0773, + "mean_token_accuracy": 0.6871429681777954, + "num_tokens": 308978715.0, + "step": 520 + }, + { + "epoch": 2.480952380952381, + "grad_norm": 0.598521888256073, + "learning_rate": 1e-05, + "loss": 1.0756, + "mean_token_accuracy": 0.6871167421340942, + "num_tokens": 309587298.0, + "step": 521 + }, + { + "epoch": 2.4857142857142858, + "grad_norm": 0.6383949518203735, + "learning_rate": 1e-05, + "loss": 1.0643, + "mean_token_accuracy": 0.6895929574966431, + "num_tokens": 310173585.0, + "step": 522 + }, + { + "epoch": 2.4904761904761905, + "grad_norm": 0.6667410731315613, + "learning_rate": 1e-05, + "loss": 1.0736, + "mean_token_accuracy": 0.6880219578742981, + "num_tokens": 310760313.0, + "step": 523 + }, + { + "epoch": 2.4952380952380953, + "grad_norm": 0.6218487620353699, + "learning_rate": 1e-05, + "loss": 1.0764, + "mean_token_accuracy": 0.6872262358665466, + "num_tokens": 311374002.0, + "step": 524 + }, + { + "epoch": 2.5, + "grad_norm": 0.6058824062347412, + "learning_rate": 1e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.6883900165557861, + "num_tokens": 311952533.0, + "step": 525 + }, + { + "epoch": 2.5047619047619047, + "grad_norm": 0.6459484100341797, + "learning_rate": 1e-05, + "loss": 1.065, + "mean_token_accuracy": 0.6896857023239136, + "num_tokens": 312542383.0, + "step": 526 + }, + { + "epoch": 2.5095238095238095, + "grad_norm": 0.6192833781242371, + "learning_rate": 1e-05, + "loss": 1.0745, + "mean_token_accuracy": 0.686732828617096, + "num_tokens": 313136427.0, + "step": 527 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 0.602884829044342, + "learning_rate": 1e-05, + "loss": 1.0564, + "mean_token_accuracy": 0.6925665140151978, + "num_tokens": 313731115.0, + "step": 528 + }, + { + "epoch": 2.519047619047619, + "grad_norm": 0.5805109143257141, + "learning_rate": 1e-05, + "loss": 1.0644, + "mean_token_accuracy": 0.6895827651023865, + "num_tokens": 314316253.0, + "step": 529 + }, + { + "epoch": 2.5238095238095237, + "grad_norm": 0.6484024524688721, + "learning_rate": 1e-05, + "loss": 1.0634, + "mean_token_accuracy": 0.6902580857276917, + "num_tokens": 314906539.0, + "step": 530 + }, + { + "epoch": 2.5285714285714285, + "grad_norm": 0.6236498355865479, + "learning_rate": 1e-05, + "loss": 1.0611, + "mean_token_accuracy": 0.6907384991645813, + "num_tokens": 315491005.0, + "step": 531 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.68634432554245, + "learning_rate": 1e-05, + "loss": 1.0759, + "mean_token_accuracy": 0.6861008405685425, + "num_tokens": 316086156.0, + "step": 532 + }, + { + "epoch": 2.538095238095238, + "grad_norm": 0.6483022570610046, + "learning_rate": 1e-05, + "loss": 1.0809, + "mean_token_accuracy": 0.6863186359405518, + "num_tokens": 316687284.0, + "step": 533 + }, + { + "epoch": 2.5428571428571427, + "grad_norm": 0.6313026547431946, + "learning_rate": 1e-05, + "loss": 1.065, + "mean_token_accuracy": 0.6903449296951294, + "num_tokens": 317280976.0, + "step": 534 + }, + { + "epoch": 2.5476190476190474, + "grad_norm": 0.7180777788162231, + "learning_rate": 1e-05, + "loss": 1.072, + "mean_token_accuracy": 0.6879873275756836, + "num_tokens": 317869704.0, + "step": 535 + }, + { + "epoch": 2.552380952380952, + "grad_norm": 0.6203593611717224, + "learning_rate": 1e-05, + "loss": 1.0754, + "mean_token_accuracy": 0.6873841285705566, + "num_tokens": 318453830.0, + "step": 536 + }, + { + "epoch": 2.557142857142857, + "grad_norm": 0.7294032573699951, + "learning_rate": 1e-05, + "loss": 1.0816, + "mean_token_accuracy": 0.6853822469711304, + "num_tokens": 319036628.0, + "step": 537 + }, + { + "epoch": 2.5619047619047617, + "grad_norm": 0.6315251588821411, + "learning_rate": 1e-05, + "loss": 1.0671, + "mean_token_accuracy": 0.6895589828491211, + "num_tokens": 319641680.0, + "step": 538 + }, + { + "epoch": 2.5666666666666664, + "grad_norm": 0.6481133699417114, + "learning_rate": 1e-05, + "loss": 1.0733, + "mean_token_accuracy": 0.6874011754989624, + "num_tokens": 320235018.0, + "step": 539 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.6537102460861206, + "learning_rate": 1e-05, + "loss": 1.0782, + "mean_token_accuracy": 0.6865108609199524, + "num_tokens": 320839523.0, + "step": 540 + }, + { + "epoch": 2.576190476190476, + "grad_norm": 0.5990563631057739, + "learning_rate": 1e-05, + "loss": 1.0692, + "mean_token_accuracy": 0.6882610321044922, + "num_tokens": 321445469.0, + "step": 541 + }, + { + "epoch": 2.580952380952381, + "grad_norm": 0.7251924276351929, + "learning_rate": 1e-05, + "loss": 1.0769, + "mean_token_accuracy": 0.685580849647522, + "num_tokens": 322040382.0, + "step": 542 + }, + { + "epoch": 2.585714285714286, + "grad_norm": 0.5734168291091919, + "learning_rate": 1e-05, + "loss": 1.0731, + "mean_token_accuracy": 0.6879425048828125, + "num_tokens": 322631980.0, + "step": 543 + }, + { + "epoch": 2.5904761904761906, + "grad_norm": 0.6524589657783508, + "learning_rate": 1e-05, + "loss": 1.0715, + "mean_token_accuracy": 0.6874139308929443, + "num_tokens": 323217003.0, + "step": 544 + }, + { + "epoch": 2.5952380952380953, + "grad_norm": 0.6292608976364136, + "learning_rate": 1e-05, + "loss": 1.0751, + "mean_token_accuracy": 0.6870990991592407, + "num_tokens": 323797882.0, + "step": 545 + }, + { + "epoch": 2.6, + "grad_norm": 0.631439208984375, + "learning_rate": 1e-05, + "loss": 1.0783, + "mean_token_accuracy": 0.685767412185669, + "num_tokens": 324381508.0, + "step": 546 + }, + { + "epoch": 2.604761904761905, + "grad_norm": 0.621782124042511, + "learning_rate": 1e-05, + "loss": 1.0954, + "mean_token_accuracy": 0.6821581125259399, + "num_tokens": 324979976.0, + "step": 547 + }, + { + "epoch": 2.6095238095238096, + "grad_norm": 0.6306419372558594, + "learning_rate": 1e-05, + "loss": 1.0677, + "mean_token_accuracy": 0.6885519623756409, + "num_tokens": 325579147.0, + "step": 548 + }, + { + "epoch": 2.6142857142857143, + "grad_norm": 0.5700802206993103, + "learning_rate": 1e-05, + "loss": 1.0701, + "mean_token_accuracy": 0.6891588568687439, + "num_tokens": 326189724.0, + "step": 549 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 0.5674880146980286, + "learning_rate": 1e-05, + "loss": 1.0723, + "mean_token_accuracy": 0.6874587535858154, + "num_tokens": 326781040.0, + "step": 550 + }, + { + "epoch": 2.623809523809524, + "grad_norm": 0.6210941076278687, + "learning_rate": 1e-05, + "loss": 1.066, + "mean_token_accuracy": 0.6903613805770874, + "num_tokens": 327384993.0, + "step": 551 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 0.5762701630592346, + "learning_rate": 1e-05, + "loss": 1.0541, + "mean_token_accuracy": 0.6926007866859436, + "num_tokens": 327967527.0, + "step": 552 + }, + { + "epoch": 2.6333333333333333, + "grad_norm": 0.5869442224502563, + "learning_rate": 1e-05, + "loss": 1.0602, + "mean_token_accuracy": 0.6907045841217041, + "num_tokens": 328556111.0, + "step": 553 + }, + { + "epoch": 2.638095238095238, + "grad_norm": 0.6561670303344727, + "learning_rate": 1e-05, + "loss": 1.067, + "mean_token_accuracy": 0.6888686418533325, + "num_tokens": 329156419.0, + "step": 554 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.5729210376739502, + "learning_rate": 1e-05, + "loss": 1.0908, + "mean_token_accuracy": 0.6830568313598633, + "num_tokens": 329765795.0, + "step": 555 + }, + { + "epoch": 2.6476190476190475, + "grad_norm": 0.5583658218383789, + "learning_rate": 1e-05, + "loss": 1.0715, + "mean_token_accuracy": 0.6889873743057251, + "num_tokens": 330366805.0, + "step": 556 + }, + { + "epoch": 2.6523809523809523, + "grad_norm": 0.6156875491142273, + "learning_rate": 1e-05, + "loss": 1.0597, + "mean_token_accuracy": 0.6899721622467041, + "num_tokens": 330960683.0, + "step": 557 + }, + { + "epoch": 2.657142857142857, + "grad_norm": 0.5830056667327881, + "learning_rate": 1e-05, + "loss": 1.0766, + "mean_token_accuracy": 0.6871880292892456, + "num_tokens": 331566617.0, + "step": 558 + }, + { + "epoch": 2.6619047619047618, + "grad_norm": 0.6878387928009033, + "learning_rate": 1e-05, + "loss": 1.0606, + "mean_token_accuracy": 0.6908230781555176, + "num_tokens": 332146780.0, + "step": 559 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.6010000705718994, + "learning_rate": 1e-05, + "loss": 1.073, + "mean_token_accuracy": 0.6884846091270447, + "num_tokens": 332744565.0, + "step": 560 + }, + { + "epoch": 2.6714285714285713, + "grad_norm": 0.6257455348968506, + "learning_rate": 1e-05, + "loss": 1.0718, + "mean_token_accuracy": 0.6879858374595642, + "num_tokens": 333327246.0, + "step": 561 + }, + { + "epoch": 2.6761904761904765, + "grad_norm": 0.6111727356910706, + "learning_rate": 1e-05, + "loss": 1.0604, + "mean_token_accuracy": 0.6906289458274841, + "num_tokens": 333924279.0, + "step": 562 + }, + { + "epoch": 2.680952380952381, + "grad_norm": 0.6363468170166016, + "learning_rate": 1e-05, + "loss": 1.0628, + "mean_token_accuracy": 0.6897515058517456, + "num_tokens": 334526451.0, + "step": 563 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.6247795820236206, + "learning_rate": 1e-05, + "loss": 1.0726, + "mean_token_accuracy": 0.6881762742996216, + "num_tokens": 335121296.0, + "step": 564 + }, + { + "epoch": 2.6904761904761907, + "grad_norm": 0.7256935238838196, + "learning_rate": 1e-05, + "loss": 1.069, + "mean_token_accuracy": 0.6878950595855713, + "num_tokens": 335705229.0, + "step": 565 + }, + { + "epoch": 2.6952380952380954, + "grad_norm": 0.6218934655189514, + "learning_rate": 1e-05, + "loss": 1.0745, + "mean_token_accuracy": 0.687312126159668, + "num_tokens": 336296515.0, + "step": 566 + }, + { + "epoch": 2.7, + "grad_norm": 0.64492267370224, + "learning_rate": 1e-05, + "loss": 1.0772, + "mean_token_accuracy": 0.6871779561042786, + "num_tokens": 336898581.0, + "step": 567 + }, + { + "epoch": 2.704761904761905, + "grad_norm": 0.6439410448074341, + "learning_rate": 1e-05, + "loss": 1.0849, + "mean_token_accuracy": 0.685498833656311, + "num_tokens": 337492720.0, + "step": 568 + }, + { + "epoch": 2.7095238095238097, + "grad_norm": 0.5982577204704285, + "learning_rate": 1e-05, + "loss": 1.0685, + "mean_token_accuracy": 0.6890565752983093, + "num_tokens": 338088080.0, + "step": 569 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.6382868885993958, + "learning_rate": 1e-05, + "loss": 1.0678, + "mean_token_accuracy": 0.6893150806427002, + "num_tokens": 338682863.0, + "step": 570 + }, + { + "epoch": 2.719047619047619, + "grad_norm": 0.5995696187019348, + "learning_rate": 1e-05, + "loss": 1.0737, + "mean_token_accuracy": 0.6885708570480347, + "num_tokens": 339274595.0, + "step": 571 + }, + { + "epoch": 2.723809523809524, + "grad_norm": 0.6478890180587769, + "learning_rate": 1e-05, + "loss": 1.0736, + "mean_token_accuracy": 0.687543511390686, + "num_tokens": 339857633.0, + "step": 572 + }, + { + "epoch": 2.7285714285714286, + "grad_norm": 0.6489014625549316, + "learning_rate": 1e-05, + "loss": 1.0564, + "mean_token_accuracy": 0.6918776035308838, + "num_tokens": 340451043.0, + "step": 573 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.6406450271606445, + "learning_rate": 1e-05, + "loss": 1.0801, + "mean_token_accuracy": 0.6862790584564209, + "num_tokens": 341042238.0, + "step": 574 + }, + { + "epoch": 2.738095238095238, + "grad_norm": 0.6261545419692993, + "learning_rate": 1e-05, + "loss": 1.0766, + "mean_token_accuracy": 0.6869131922721863, + "num_tokens": 341644006.0, + "step": 575 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 0.5907791256904602, + "learning_rate": 1e-05, + "loss": 1.0666, + "mean_token_accuracy": 0.6884465217590332, + "num_tokens": 342238235.0, + "step": 576 + }, + { + "epoch": 2.7476190476190476, + "grad_norm": 0.638664186000824, + "learning_rate": 1e-05, + "loss": 1.0609, + "mean_token_accuracy": 0.6908861994743347, + "num_tokens": 342831260.0, + "step": 577 + }, + { + "epoch": 2.7523809523809524, + "grad_norm": 0.6344829797744751, + "learning_rate": 1e-05, + "loss": 1.0762, + "mean_token_accuracy": 0.687269389629364, + "num_tokens": 343427629.0, + "step": 578 + }, + { + "epoch": 2.757142857142857, + "grad_norm": 0.6150461435317993, + "learning_rate": 1e-05, + "loss": 1.0761, + "mean_token_accuracy": 0.6873693466186523, + "num_tokens": 344021401.0, + "step": 579 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.6308332681655884, + "learning_rate": 1e-05, + "loss": 1.0618, + "mean_token_accuracy": 0.6911748647689819, + "num_tokens": 344610108.0, + "step": 580 + }, + { + "epoch": 2.7666666666666666, + "grad_norm": 0.55866539478302, + "learning_rate": 1e-05, + "loss": 1.0725, + "mean_token_accuracy": 0.6869944334030151, + "num_tokens": 345217241.0, + "step": 581 + }, + { + "epoch": 2.7714285714285714, + "grad_norm": 0.638909637928009, + "learning_rate": 1e-05, + "loss": 1.0658, + "mean_token_accuracy": 0.6890157461166382, + "num_tokens": 345804258.0, + "step": 582 + }, + { + "epoch": 2.776190476190476, + "grad_norm": 0.5688804984092712, + "learning_rate": 1e-05, + "loss": 1.0689, + "mean_token_accuracy": 0.6887319087982178, + "num_tokens": 346399481.0, + "step": 583 + }, + { + "epoch": 2.780952380952381, + "grad_norm": 0.6002762317657471, + "learning_rate": 1e-05, + "loss": 1.0563, + "mean_token_accuracy": 0.6915134191513062, + "num_tokens": 346997922.0, + "step": 584 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.6163663864135742, + "learning_rate": 1e-05, + "loss": 1.07, + "mean_token_accuracy": 0.6884576082229614, + "num_tokens": 347597863.0, + "step": 585 + }, + { + "epoch": 2.7904761904761903, + "grad_norm": 0.580531656742096, + "learning_rate": 1e-05, + "loss": 1.0638, + "mean_token_accuracy": 0.6888343095779419, + "num_tokens": 348201046.0, + "step": 586 + }, + { + "epoch": 2.795238095238095, + "grad_norm": 0.5918668508529663, + "learning_rate": 1e-05, + "loss": 1.0584, + "mean_token_accuracy": 0.6905962228775024, + "num_tokens": 348787326.0, + "step": 587 + }, + { + "epoch": 2.8, + "grad_norm": 0.6383691430091858, + "learning_rate": 1e-05, + "loss": 1.0689, + "mean_token_accuracy": 0.6883484125137329, + "num_tokens": 349380645.0, + "step": 588 + }, + { + "epoch": 2.8047619047619046, + "grad_norm": 0.6115639805793762, + "learning_rate": 1e-05, + "loss": 1.0654, + "mean_token_accuracy": 0.6897737979888916, + "num_tokens": 349983092.0, + "step": 589 + }, + { + "epoch": 2.8095238095238093, + "grad_norm": 0.6397126317024231, + "learning_rate": 1e-05, + "loss": 1.0617, + "mean_token_accuracy": 0.6903370022773743, + "num_tokens": 350576836.0, + "step": 590 + }, + { + "epoch": 2.814285714285714, + "grad_norm": 0.6862447261810303, + "learning_rate": 1e-05, + "loss": 1.0624, + "mean_token_accuracy": 0.691243588924408, + "num_tokens": 351172725.0, + "step": 591 + }, + { + "epoch": 2.819047619047619, + "grad_norm": 0.6518527269363403, + "learning_rate": 1e-05, + "loss": 1.0568, + "mean_token_accuracy": 0.6918639540672302, + "num_tokens": 351777142.0, + "step": 592 + }, + { + "epoch": 2.8238095238095235, + "grad_norm": 0.7507683634757996, + "learning_rate": 1e-05, + "loss": 1.0576, + "mean_token_accuracy": 0.6908581852912903, + "num_tokens": 352361834.0, + "step": 593 + }, + { + "epoch": 2.8285714285714287, + "grad_norm": 0.6769391298294067, + "learning_rate": 1e-05, + "loss": 1.0658, + "mean_token_accuracy": 0.6891970038414001, + "num_tokens": 352964892.0, + "step": 594 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.7207344770431519, + "learning_rate": 1e-05, + "loss": 1.0655, + "mean_token_accuracy": 0.688566267490387, + "num_tokens": 353563696.0, + "step": 595 + }, + { + "epoch": 2.8380952380952382, + "grad_norm": 0.6687008142471313, + "learning_rate": 1e-05, + "loss": 1.0663, + "mean_token_accuracy": 0.6889446377754211, + "num_tokens": 354162323.0, + "step": 596 + }, + { + "epoch": 2.842857142857143, + "grad_norm": 0.6510334610939026, + "learning_rate": 1e-05, + "loss": 1.0973, + "mean_token_accuracy": 0.6817850470542908, + "num_tokens": 354763224.0, + "step": 597 + }, + { + "epoch": 2.8476190476190477, + "grad_norm": 0.6164536476135254, + "learning_rate": 1e-05, + "loss": 1.0599, + "mean_token_accuracy": 0.6904336214065552, + "num_tokens": 355360011.0, + "step": 598 + }, + { + "epoch": 2.8523809523809525, + "grad_norm": 0.6652323603630066, + "learning_rate": 1e-05, + "loss": 1.0664, + "mean_token_accuracy": 0.6892472505569458, + "num_tokens": 355948770.0, + "step": 599 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.6170997619628906, + "learning_rate": 1e-05, + "loss": 1.0749, + "mean_token_accuracy": 0.686385989189148, + "num_tokens": 356555915.0, + "step": 600 + }, + { + "epoch": 2.861904761904762, + "grad_norm": 0.5823125839233398, + "learning_rate": 1e-05, + "loss": 1.0762, + "mean_token_accuracy": 0.6866952180862427, + "num_tokens": 357168089.0, + "step": 601 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.6084815859794617, + "learning_rate": 1e-05, + "loss": 1.0589, + "mean_token_accuracy": 0.6905912160873413, + "num_tokens": 357762209.0, + "step": 602 + }, + { + "epoch": 2.8714285714285714, + "grad_norm": 0.5347459316253662, + "learning_rate": 1e-05, + "loss": 1.065, + "mean_token_accuracy": 0.6898081302642822, + "num_tokens": 358365632.0, + "step": 603 + }, + { + "epoch": 2.876190476190476, + "grad_norm": 0.6211216449737549, + "learning_rate": 1e-05, + "loss": 1.0795, + "mean_token_accuracy": 0.6860474944114685, + "num_tokens": 358969038.0, + "step": 604 + }, + { + "epoch": 2.880952380952381, + "grad_norm": 0.6298102736473083, + "learning_rate": 1e-05, + "loss": 1.0616, + "mean_token_accuracy": 0.691013514995575, + "num_tokens": 359560638.0, + "step": 605 + }, + { + "epoch": 2.8857142857142857, + "grad_norm": 0.6150857210159302, + "learning_rate": 1e-05, + "loss": 1.0726, + "mean_token_accuracy": 0.6874991655349731, + "num_tokens": 360159047.0, + "step": 606 + }, + { + "epoch": 2.8904761904761904, + "grad_norm": 0.6256808638572693, + "learning_rate": 1e-05, + "loss": 1.0696, + "mean_token_accuracy": 0.6895867586135864, + "num_tokens": 360752550.0, + "step": 607 + }, + { + "epoch": 2.895238095238095, + "grad_norm": 0.6338992714881897, + "learning_rate": 1e-05, + "loss": 1.0706, + "mean_token_accuracy": 0.6878842115402222, + "num_tokens": 361348966.0, + "step": 608 + }, + { + "epoch": 2.9, + "grad_norm": 0.6074673533439636, + "learning_rate": 1e-05, + "loss": 1.0625, + "mean_token_accuracy": 0.690711498260498, + "num_tokens": 361933541.0, + "step": 609 + }, + { + "epoch": 2.9047619047619047, + "grad_norm": 0.6169112324714661, + "learning_rate": 1e-05, + "loss": 1.0679, + "mean_token_accuracy": 0.6893640756607056, + "num_tokens": 362522689.0, + "step": 610 + }, + { + "epoch": 2.9095238095238094, + "grad_norm": 0.6712765097618103, + "learning_rate": 1e-05, + "loss": 1.0481, + "mean_token_accuracy": 0.6935627460479736, + "num_tokens": 363107779.0, + "step": 611 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.6030009388923645, + "learning_rate": 1e-05, + "loss": 1.0624, + "mean_token_accuracy": 0.6899582147598267, + "num_tokens": 363690809.0, + "step": 612 + }, + { + "epoch": 2.919047619047619, + "grad_norm": 0.6335533261299133, + "learning_rate": 1e-05, + "loss": 1.0632, + "mean_token_accuracy": 0.6892010569572449, + "num_tokens": 364279923.0, + "step": 613 + }, + { + "epoch": 2.923809523809524, + "grad_norm": 0.6299601793289185, + "learning_rate": 1e-05, + "loss": 1.0536, + "mean_token_accuracy": 0.6920279264450073, + "num_tokens": 364846929.0, + "step": 614 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.6494601964950562, + "learning_rate": 1e-05, + "loss": 1.0797, + "mean_token_accuracy": 0.6871404051780701, + "num_tokens": 365427755.0, + "step": 615 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.6412233710289001, + "learning_rate": 1e-05, + "loss": 1.0618, + "mean_token_accuracy": 0.6902071833610535, + "num_tokens": 366022879.0, + "step": 616 + }, + { + "epoch": 2.9380952380952383, + "grad_norm": 0.5901429653167725, + "learning_rate": 1e-05, + "loss": 1.0639, + "mean_token_accuracy": 0.6891224384307861, + "num_tokens": 366608198.0, + "step": 617 + }, + { + "epoch": 2.942857142857143, + "grad_norm": 0.6606128811836243, + "learning_rate": 1e-05, + "loss": 1.0687, + "mean_token_accuracy": 0.6881773471832275, + "num_tokens": 367187170.0, + "step": 618 + }, + { + "epoch": 2.947619047619048, + "grad_norm": 0.6021740436553955, + "learning_rate": 1e-05, + "loss": 1.062, + "mean_token_accuracy": 0.6895371675491333, + "num_tokens": 367778542.0, + "step": 619 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.6304929852485657, + "learning_rate": 1e-05, + "loss": 1.0685, + "mean_token_accuracy": 0.6876203417778015, + "num_tokens": 368374361.0, + "step": 620 + }, + { + "epoch": 2.9571428571428573, + "grad_norm": 0.6775472164154053, + "learning_rate": 1e-05, + "loss": 1.0693, + "mean_token_accuracy": 0.688637375831604, + "num_tokens": 368975961.0, + "step": 621 + }, + { + "epoch": 2.961904761904762, + "grad_norm": 0.6188324689865112, + "learning_rate": 1e-05, + "loss": 1.0446, + "mean_token_accuracy": 0.69502854347229, + "num_tokens": 369565801.0, + "step": 622 + }, + { + "epoch": 2.966666666666667, + "grad_norm": 0.7237592339515686, + "learning_rate": 1e-05, + "loss": 1.069, + "mean_token_accuracy": 0.6882259845733643, + "num_tokens": 370147963.0, + "step": 623 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 0.5706875920295715, + "learning_rate": 1e-05, + "loss": 1.0661, + "mean_token_accuracy": 0.688866376876831, + "num_tokens": 370730337.0, + "step": 624 + }, + { + "epoch": 2.9761904761904763, + "grad_norm": 0.6157565712928772, + "learning_rate": 1e-05, + "loss": 1.0595, + "mean_token_accuracy": 0.6903921365737915, + "num_tokens": 371313464.0, + "step": 625 + }, + { + "epoch": 2.980952380952381, + "grad_norm": 0.5899333953857422, + "learning_rate": 1e-05, + "loss": 1.0634, + "mean_token_accuracy": 0.690090537071228, + "num_tokens": 371903211.0, + "step": 626 + }, + { + "epoch": 2.9857142857142858, + "grad_norm": 0.6269708275794983, + "learning_rate": 1e-05, + "loss": 1.0536, + "mean_token_accuracy": 0.6934218406677246, + "num_tokens": 372496314.0, + "step": 627 + }, + { + "epoch": 2.9904761904761905, + "grad_norm": 0.6969268321990967, + "learning_rate": 1e-05, + "loss": 1.0676, + "mean_token_accuracy": 0.688661515712738, + "num_tokens": 373096610.0, + "step": 628 + }, + { + "epoch": 2.9952380952380953, + "grad_norm": 0.5695185661315918, + "learning_rate": 1e-05, + "loss": 1.0693, + "mean_token_accuracy": 0.6875466108322144, + "num_tokens": 373694165.0, + "step": 629 + }, + { + "epoch": 3.0, + "grad_norm": 0.6636136174201965, + "learning_rate": 1e-05, + "loss": 1.0583, + "mean_token_accuracy": 0.6922066807746887, + "num_tokens": 374283247.0, + "step": 630 + }, + { + "epoch": 3.0, + "step": 630, + "total_flos": 2.1853937174671524e+18, + "train_loss": 1.1615400253780304, + "train_runtime": 1840.1375, + "train_samples_per_second": 175.286, + "train_steps_per_second": 0.342 + } + ], + "logging_steps": 1, + "max_steps": 630, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 315, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1853937174671524e+18, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..fdbc740 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9056356460304202fe61664f9ab85520196516e5f05380540878306b4071d062 +size 13329